1"""
2Ceph cluster task.
3
4Handle the setup, starting, and clean-up of a Ceph cluster.
5"""
6from cStringIO import StringIO
7
8import argparse
9import contextlib
10import errno
11import logging
12import os
13import json
14import time
15import gevent
16import socket
17
18from paramiko import SSHException
19from ceph_manager import CephManager, write_conf
20from tasks.cephfs.filesystem import Filesystem
21from teuthology import misc as teuthology
22from teuthology import contextutil
23from teuthology import exceptions
24from teuthology.orchestra import run
25import ceph_client as cclient
26from teuthology.orchestra.daemon import DaemonGroup
27
28CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
29
30log = logging.getLogger(__name__)
31
32
33def generate_caps(type_):
34 """
35 Each call will return the next capability for each system type
36 (essentially a subset of possible role values). Valid types are osd,
37 mgr, mds and client.
38 """
39 defaults = dict(
40 osd=dict(
41 mon='allow *',
42 mgr='allow *',
43 osd='allow *',
44 ),
45 mgr=dict(
46 mon='allow *',
47 ),
48 mds=dict(
49 mon='allow *',
50 mgr='allow *',
51 osd='allow *',
52 mds='allow',
53 ),
54 client=dict(
55 mon='allow rw',
56 mgr='allow r',
57 osd='allow rwx',
58 mds='allow',
59 ),
60 )
61 for subsystem, capability in defaults[type_].items():
62 yield '--cap'
63 yield subsystem
64 yield capability
65
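# Illustrative only (not part of the original module): for type_='osd' the
# generator above produces a flat ceph-authtool argument list, e.g.
#     list(generate_caps('osd')) ==
#         ['--cap', 'mon', 'allow *',
#          '--cap', 'mgr', 'allow *',
#          '--cap', 'osd', 'allow *']
# (pair order may vary, since the defaults are stored in a plain dict).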
66
67@contextlib.contextmanager
68def ceph_log(ctx, config):
69 """
70 Create /var/log/ceph log directory that is open to everyone.
71 Add valgrind and profiling-logger directories.
72
73 :param ctx: Context
74 :param config: Configuration
75 """
76 log.info('Making ceph log dir writeable by non-root...')
77 run.wait(
78 ctx.cluster.run(
79 args=[
80 'sudo',
81 'chmod',
82 '777',
83 '/var/log/ceph',
84 ],
85 wait=False,
86 )
87 )
88 log.info('Disabling ceph logrotate...')
89 run.wait(
90 ctx.cluster.run(
91 args=[
92 'sudo',
93 'rm', '-f', '--',
94 '/etc/logrotate.d/ceph',
95 ],
96 wait=False,
97 )
98 )
99 log.info('Creating extra log directories...')
100 run.wait(
101 ctx.cluster.run(
102 args=[
103 'sudo',
104 'install', '-d', '-m0777', '--',
105 '/var/log/ceph/valgrind',
106 '/var/log/ceph/profiling-logger',
107 ],
108 wait=False,
109 )
110 )
111
112 class Rotater(object):
113 stop_event = gevent.event.Event()
114
115 def invoke_logrotate(self):
116 # 1) install ceph-test.conf in /etc/logrotate.d
117 # 2) continuously loop over logrotate invocation with ceph-test.conf
118 while not self.stop_event.is_set():
119 self.stop_event.wait(timeout=30)
120 try:
121 run.wait(
122 ctx.cluster.run(
123 args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
124 ],
125 wait=False,
126 )
127 )
128 except exceptions.ConnectionLostError as e:
129 # Some tests may power off nodes during test, in which
130 # case we will see connection errors that we should ignore.
131 log.debug("Missed logrotate, node '{0}' is offline".format(
132 e.node))
133 except EOFError as e:
134 # Paramiko sometimes raises this when it fails to
135 # connect to a node during open_session. As with
136 # ConnectionLostError, we ignore this because nodes
137 # are allowed to get power cycled during tests.
138 log.debug("Missed logrotate, EOFError")
139 except SSHException as e:
140 log.debug("Missed logrotate, SSHException")
141 except socket.error as e:
142 if e.errno == errno.EHOSTUNREACH:
143 log.debug("Missed logrotate, host unreachable")
144 else:
145 raise
146
147 def begin(self):
148 self.thread = gevent.spawn(self.invoke_logrotate)
149
150 def end(self):
151 self.stop_event.set()
152 self.thread.get()
153
154 def write_rotate_conf(ctx, daemons):
155 testdir = teuthology.get_testdir(ctx)
156 rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
157 with file(rotate_conf_path, 'rb') as f:
158 conf = ""
159 for daemon, size in daemons.iteritems():
160 log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
161 conf += f.read().format(daemon_type=daemon, max_size=size)
162 f.seek(0, 0)
163
164 for remote in ctx.cluster.remotes.iterkeys():
165 teuthology.write_file(remote=remote,
166 path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
167 data=StringIO(conf)
168 )
169 remote.run(
170 args=[
171 'sudo',
172 'mv',
173 '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
174 '/etc/logrotate.d/ceph-test.conf',
175 run.Raw('&&'),
176 'sudo',
177 'chmod',
178 '0644',
179 '/etc/logrotate.d/ceph-test.conf',
180 run.Raw('&&'),
181 'sudo',
182 'chown',
183 'root.root',
184 '/etc/logrotate.d/ceph-test.conf'
185 ]
186 )
187 remote.chcon('/etc/logrotate.d/ceph-test.conf',
188 'system_u:object_r:etc_t:s0')
189
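# Hypothetical example of the 'log-rotate' entry consumed below (daemon type
# mapped to the max_size substituted into logrotate.conf; the values are
# illustrative, not taken from this file):
#     log-rotate:
#       ceph-osd: 10G
#       ceph-mds: 10G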
190 if ctx.config.get('log-rotate'):
191 daemons = ctx.config.get('log-rotate')
192 log.info('Setting up log rotation with ' + str(daemons))
193 write_rotate_conf(ctx, daemons)
194 logrotater = Rotater()
195 logrotater.begin()
196 try:
197 yield
198
199 finally:
200 if ctx.config.get('log-rotate'):
201 log.info('Shutting down logrotate')
202 logrotater.end()
203 ctx.cluster.run(
204 args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
205 ]
206 )
207 if ctx.archive is not None and \
208 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
209 # and logs
210 log.info('Compressing logs...')
211 run.wait(
212 ctx.cluster.run(
213 args=[
214 'sudo',
215 'find',
216 '/var/log/ceph',
217 '-name',
218 '*.log',
219 '-print0',
220 run.Raw('|'),
221 'sudo',
222 'xargs',
223 '-0',
224 '--no-run-if-empty',
225 '--',
226 'gzip',
227 '--',
228 ],
229 wait=False,
230 ),
231 )
232
233 log.info('Archiving logs...')
234 path = os.path.join(ctx.archive, 'remote')
235 os.makedirs(path)
236 for remote in ctx.cluster.remotes.iterkeys():
237 sub = os.path.join(path, remote.shortname)
238 os.makedirs(sub)
239 teuthology.pull_directory(remote, '/var/log/ceph',
240 os.path.join(sub, 'log'))
241
242
243def assign_devs(roles, devs):
244 """
245 Create a dictionary of devs indexed by roles
246
247 :param roles: List of roles
248 :param devs: Corresponding list of devices.
249 :returns: Dictionary of devs indexed by roles.
250 """
251 return dict(zip(roles, devs))
252
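# For example (hypothetical values):
#     assign_devs(['osd.0', 'osd.1'], ['/dev/sdb', '/dev/sdc'])
#     == {'osd.0': '/dev/sdb', 'osd.1': '/dev/sdc'}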
253
254@contextlib.contextmanager
255def valgrind_post(ctx, config):
256 """
257 After the tests run, look through all the valgrind logs. Exceptions are raised
258 if textual errors occurred in the logs, or if valgrind exceptions were detected in
259 the logs.
260
261 :param ctx: Context
262 :param config: Configuration
263 """
264 try:
265 yield
266 finally:
267 lookup_procs = list()
268 log.info('Checking for errors in any valgrind logs...')
269 for remote in ctx.cluster.remotes.iterkeys():
270 # look at valgrind logs for each node
271 proc = remote.run(
272 args=[
273 'sudo',
274 'zgrep',
275 '<kind>',
276 run.Raw('/var/log/ceph/valgrind/*'),
277 '/dev/null', # include a second file so that we always get a filename prefix on the output
278 run.Raw('|'),
279 'sort',
280 run.Raw('|'),
281 'uniq',
282 ],
283 wait=False,
284 check_status=False,
285 stdout=StringIO(),
286 )
287 lookup_procs.append((proc, remote))
288
289 valgrind_exception = None
290 for (proc, remote) in lookup_procs:
291 proc.wait()
292 out = proc.stdout.getvalue()
293 for line in out.split('\n'):
294 if line == '':
295 continue
296 try:
297 (file, kind) = line.split(':')
298 except Exception:
299 log.error('failed to split line %s', line)
300 raise
301 log.debug('file %s kind %s', file, kind)
302 if (file.find('mds') >= 0) and kind.find('Lost') > 0:
303 continue
304 log.error('saw valgrind issue %s in %s', kind, file)
305 valgrind_exception = Exception('saw valgrind issues')
306
307 if config.get('expect_valgrind_errors'):
308 if not valgrind_exception:
309 raise Exception('expected valgrind issues and found none')
310 else:
311 if valgrind_exception:
312 raise valgrind_exception
313
314
315@contextlib.contextmanager
316def crush_setup(ctx, config):
317 cluster_name = config['cluster']
318 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
319 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
320
321 profile = config.get('crush_tunables', 'default')
322 log.info('Setting crush tunables to %s', profile)
323 mon_remote.run(
324 args=['sudo', 'ceph', '--cluster', cluster_name,
325 'osd', 'crush', 'tunables', profile])
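# Note (general Ceph knowledge, not stated in this file): 'crush_tunables'
# is passed straight to 'ceph osd crush tunables', so any profile that
# command accepts (e.g. legacy, bobtail, firefly, hammer, jewel, optimal,
# default) is a valid value here.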
326 yield
327
328
329@contextlib.contextmanager
330def create_rbd_pool(ctx, config):
331 cluster_name = config['cluster']
332 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
333 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
334 log.info('Waiting for OSDs to come up')
335 teuthology.wait_until_osds_up(
336 ctx,
337 cluster=ctx.cluster,
338 remote=mon_remote,
339 ceph_cluster=cluster_name,
340 )
341 log.info('Creating RBD pool')
342 mon_remote.run(
343 args=['sudo', 'ceph', '--cluster', cluster_name,
344 'osd', 'pool', 'create', 'rbd', '8'])
345 mon_remote.run(
346 args=[
347 'sudo', 'ceph', '--cluster', cluster_name,
348 'osd', 'pool', 'application', 'enable',
349 'rbd', 'rbd', '--yes-i-really-mean-it'
350 ],
351 check_status=False)
352 yield
353
354@contextlib.contextmanager
355def cephfs_setup(ctx, config):
356 cluster_name = config['cluster']
357 testdir = teuthology.get_testdir(ctx)
358 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
359
360 first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
361 (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
362 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
363 # If there are any MDSs, then create a filesystem for them to use
364 # Do this last because requires mon cluster to be up and running
365 if mdss.remotes:
366 log.info('Setting up CephFS filesystem...')
367
368 fs = Filesystem(ctx, create='cephfs')
369
370 is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
371 all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
372 num_active = len([r for r in all_roles if is_active_mds(r)])
373
374 fs.set_max_mds(num_active)
375 fs.set_allow_dirfrags(True)
376
377 yield
378
379
380@contextlib.contextmanager
381def cluster(ctx, config):
382 """
383 Handle the creation and removal of a ceph cluster.
384
385 On startup:
386 Create directories needed for the cluster.
387 Create remote journals for all osds.
388 Create and set keyring.
389 Copy the monmap to the test systems.
390 Setup mon nodes.
391 Setup mds nodes.
392 Mkfs osd nodes.
393 Add keyring information to monmaps
394 Mkfs mon nodes.
395
396 On exit:
397 If errors occurred, extract a failure message and store in ctx.summary.
398 Unmount all test files and temporary journaling files.
399 Save the monitor information and archive all ceph logs.
400 Cleanup the keyring setup, and remove all monitor map and data files left over.
401
402 :param ctx: Context
403 :param config: Configuration
404 """
405 if ctx.config.get('use_existing_cluster', False) is True:
406 log.info("'use_existing_cluster' is true; skipping cluster creation")
407 yield
408
409 testdir = teuthology.get_testdir(ctx)
410 cluster_name = config['cluster']
411 data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
412 log.info('Creating ceph cluster %s...', cluster_name)
413 run.wait(
414 ctx.cluster.run(
415 args=[
416 'install', '-d', '-m0755', '--',
417 data_dir,
418 ],
419 wait=False,
420 )
421 )
422
423 run.wait(
424 ctx.cluster.run(
425 args=[
426 'sudo',
427 'install', '-d', '-m0777', '--', '/var/run/ceph',
428 ],
429 wait=False,
430 )
431 )
432
433 devs_to_clean = {}
434 remote_to_roles_to_devs = {}
435 remote_to_roles_to_journals = {}
436 osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
437 for remote, roles_for_host in osds.remotes.iteritems():
438 devs = teuthology.get_scratch_devices(remote)
439 roles_to_devs = {}
440 roles_to_journals = {}
441 if config.get('fs'):
442 log.info('fs option selected, checking for scratch devs')
443 log.info('found devs: %s' % (str(devs),))
444 devs_id_map = teuthology.get_wwn_id_map(remote, devs)
445 iddevs = devs_id_map.values()
446 roles_to_devs = assign_devs(
447 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
448 )
449 if len(roles_to_devs) < len(iddevs):
450 iddevs = iddevs[len(roles_to_devs):]
451 devs_to_clean[remote] = []
452
453 if config.get('block_journal'):
454 log.info('block journal enabled')
455 roles_to_journals = assign_devs(
456 teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
457 )
458 log.info('journal map: %s', roles_to_journals)
459
460 if config.get('tmpfs_journal'):
461 log.info('tmpfs journal enabled')
462 roles_to_journals = {}
463 remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
464 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
465 tmpfs = '/mnt/' + role
466 roles_to_journals[role] = tmpfs
467 remote.run(args=['truncate', '-s', '1500M', tmpfs])
468 log.info('journal map: %s', roles_to_journals)
469
470 log.info('dev map: %s' % (str(roles_to_devs),))
471 remote_to_roles_to_devs[remote] = roles_to_devs
472 remote_to_roles_to_journals[remote] = roles_to_journals
473
474 log.info('Generating config...')
475 remotes_and_roles = ctx.cluster.remotes.items()
476 roles = [role_list for (remote, role_list) in remotes_and_roles]
477 ips = [host for (host, port) in
478 (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
479 conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
480 for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
481 for role, journal in roles_to_journals.iteritems():
482 name = teuthology.ceph_role(role)
483 if name not in conf:
484 conf[name] = {}
485 conf[name]['osd journal'] = journal
486 for section, keys in config['conf'].iteritems():
487 for key, value in keys.iteritems():
488 log.info("[%s] %s = %s" % (section, key, value))
489 if section not in conf:
490 conf[section] = {}
491 conf[section][key] = value
492
493 if config.get('tmpfs_journal'):
494 conf['journal dio'] = False
495
496 if not hasattr(ctx, 'ceph'):
497 ctx.ceph = {}
498 ctx.ceph[cluster_name] = argparse.Namespace()
499 ctx.ceph[cluster_name].conf = conf
500
501 default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
502 keyring_path = config.get('keyring_path', default_keyring)
503
504 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
505
506 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
507
508 log.info('Setting up %s...' % firstmon)
509 ctx.cluster.only(firstmon).run(
510 args=[
511 'sudo',
512 'adjust-ulimits',
513 'ceph-coverage',
514 coverage_dir,
515 'ceph-authtool',
516 '--create-keyring',
517 keyring_path,
518 ],
519 )
520 ctx.cluster.only(firstmon).run(
521 args=[
522 'sudo',
523 'adjust-ulimits',
524 'ceph-coverage',
525 coverage_dir,
526 'ceph-authtool',
527 '--gen-key',
528 '--name=mon.',
529 keyring_path,
530 ],
531 )
532 ctx.cluster.only(firstmon).run(
533 args=[
534 'sudo',
535 'chmod',
536 '0644',
537 keyring_path,
538 ],
539 )
540 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
541 monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
542 cluster=cluster_name)
543 fsid = teuthology.create_simple_monmap(
544 ctx,
545 remote=mon0_remote,
546 conf=conf,
547 path=monmap_path,
548 )
549 if not 'global' in conf:
550 conf['global'] = {}
551 conf['global']['fsid'] = fsid
552
553 default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
554 conf_path = config.get('conf_path', default_conf_path)
555 log.info('Writing %s for FSID %s...' % (conf_path, fsid))
556 write_conf(ctx, conf_path, cluster_name)
557
558 log.info('Creating admin key on %s...' % firstmon)
559 ctx.cluster.only(firstmon).run(
560 args=[
561 'sudo',
562 'adjust-ulimits',
563 'ceph-coverage',
564 coverage_dir,
565 'ceph-authtool',
566 '--gen-key',
567 '--name=client.admin',
568 '--set-uid=0',
569 '--cap', 'mon', 'allow *',
570 '--cap', 'osd', 'allow *',
571 '--cap', 'mds', 'allow *',
572 '--cap', 'mgr', 'allow *',
573 keyring_path,
574 ],
575 )
576
577 log.info('Copying monmap to all nodes...')
578 keyring = teuthology.get_file(
579 remote=mon0_remote,
580 path=keyring_path,
581 )
582 monmap = teuthology.get_file(
583 remote=mon0_remote,
584 path=monmap_path,
585 )
586
587 for rem in ctx.cluster.remotes.iterkeys():
588 # copy mon key and initial monmap
589 log.info('Sending monmap to node {remote}'.format(remote=rem))
590 teuthology.sudo_write_file(
591 remote=rem,
592 path=keyring_path,
593 data=keyring,
594 perms='0644'
595 )
596 teuthology.write_file(
597 remote=rem,
598 path=monmap_path,
599 data=monmap,
600 )
601
602 log.info('Setting up mon nodes...')
603 mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
604
605 if not config.get('skip_mgr_daemons', False):
606 log.info('Setting up mgr nodes...')
607 mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
608 for remote, roles_for_host in mgrs.remotes.iteritems():
609 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
610 cluster_name):
611 _, _, id_ = teuthology.split_role(role)
612 mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
613 cluster=cluster_name,
614 id=id_,
615 )
616 remote.run(
617 args=[
618 'sudo',
619 'mkdir',
620 '-p',
621 mgr_dir,
622 run.Raw('&&'),
623 'sudo',
624 'adjust-ulimits',
625 'ceph-coverage',
626 coverage_dir,
627 'ceph-authtool',
628 '--create-keyring',
629 '--gen-key',
630 '--name=mgr.{id}'.format(id=id_),
631 mgr_dir + '/keyring',
632 ],
633 )
634
635 log.info('Setting up mds nodes...')
636 mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
637 for remote, roles_for_host in mdss.remotes.iteritems():
638 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
639 cluster_name):
640 _, _, id_ = teuthology.split_role(role)
641 mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
642 cluster=cluster_name,
643 id=id_,
644 )
645 remote.run(
646 args=[
647 'sudo',
648 'mkdir',
649 '-p',
650 mds_dir,
651 run.Raw('&&'),
652 'sudo',
653 'adjust-ulimits',
654 'ceph-coverage',
655 coverage_dir,
656 'ceph-authtool',
657 '--create-keyring',
658 '--gen-key',
659 '--name=mds.{id}'.format(id=id_),
660 mds_dir + '/keyring',
661 ],
662 )
663
664 cclient.create_keyring(ctx, cluster_name)
665 log.info('Running mkfs on osd nodes...')
666
667 if not hasattr(ctx, 'disk_config'):
668 ctx.disk_config = argparse.Namespace()
669 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
670 ctx.disk_config.remote_to_roles_to_dev = {}
671 if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
672 ctx.disk_config.remote_to_roles_to_journals = {}
673 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
674 ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
675 if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
676 ctx.disk_config.remote_to_roles_to_dev_fstype = {}
677
678 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
679 teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
680
681 log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
682 for remote, roles_for_host in osds.remotes.iteritems():
683 roles_to_devs = remote_to_roles_to_devs[remote]
684 roles_to_journals = remote_to_roles_to_journals[remote]
685
686 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
687 _, _, id_ = teuthology.split_role(role)
688 mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
689 remote.run(
690 args=[
691 'sudo',
692 'mkdir',
693 '-p',
694 mnt_point,
695 ])
696 log.info(str(roles_to_devs))
697 log.info(str(roles_to_journals))
698 log.info(role)
699 if roles_to_devs.get(role):
700 dev = roles_to_devs[role]
701 fs = config.get('fs')
702 package = None
703 mkfs_options = config.get('mkfs_options')
704 mount_options = config.get('mount_options')
705 if fs == 'btrfs':
706 # package = 'btrfs-tools'
707 if mount_options is None:
708 mount_options = ['noatime', 'user_subvol_rm_allowed']
709 if mkfs_options is None:
710 mkfs_options = ['-m', 'single',
711 '-l', '32768',
712 '-n', '32768']
713 if fs == 'xfs':
714 # package = 'xfsprogs'
715 if mount_options is None:
716 mount_options = ['noatime']
717 if mkfs_options is None:
718 mkfs_options = ['-f', '-i', 'size=2048']
719 if fs == 'ext4' or fs == 'ext3':
720 if mount_options is None:
721 mount_options = ['noatime', 'user_xattr']
722
723 if mount_options is None:
724 mount_options = []
725 if mkfs_options is None:
726 mkfs_options = []
727 mkfs = ['mkfs.%s' % fs] + mkfs_options
728 log.info('%s on %s on %s' % (mkfs, dev, remote))
729 if package is not None:
730 remote.run(
731 args=[
732 'sudo',
733 'apt-get', 'install', '-y', package
734 ],
735 stdout=StringIO(),
736 )
737
738 try:
739 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
740 except run.CommandFailedError:
741 # Newer btrfs-tools doesn't prompt for overwrite, use -f
742 if '-f' not in mount_options:
743 mkfs_options.append('-f')
744 mkfs = ['mkfs.%s' % fs] + mkfs_options
745 log.info('%s on %s on %s' % (mkfs, dev, remote))
746 remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
747
748 log.info('mount %s on %s -o %s' % (dev, remote,
749 ','.join(mount_options)))
750 remote.run(
751 args=[
752 'sudo',
753 'mount',
754 '-t', fs,
755 '-o', ','.join(mount_options),
756 dev,
757 mnt_point,
758 ]
759 )
760 remote.run(
761 args=[
762 'sudo', '/sbin/restorecon', mnt_point,
763 ],
764 check_status=False,
765 )
766 if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
767 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
768 ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
769 if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
770 ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
771 ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
772 devs_to_clean[remote].append(mnt_point)
773
774 for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
775 _, _, id_ = teuthology.split_role(role)
776 remote.run(
777 args=[
778 'sudo',
779 'MALLOC_CHECK_=3',
780 'adjust-ulimits',
781 'ceph-coverage',
782 coverage_dir,
783 'ceph-osd',
784 '--cluster',
785 cluster_name,
786 '--mkfs',
787 '--mkkey',
788 '-i', id_,
789 '--monmap', monmap_path,
790 ],
791 )
792
793 log.info('Reading keys from all nodes...')
794 keys_fp = StringIO()
795 keys = []
796 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
797 for type_ in ['mgr', 'mds', 'osd']:
798 if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
799 continue
800 for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
801 _, _, id_ = teuthology.split_role(role)
802 data = teuthology.get_file(
803 remote=remote,
804 path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
805 type=type_,
806 id=id_,
807 cluster=cluster_name,
808 ),
809 sudo=True,
810 )
811 keys.append((type_, id_, data))
812 keys_fp.write(data)
813 for remote, roles_for_host in ctx.cluster.remotes.iteritems():
814 for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
815 _, _, id_ = teuthology.split_role(role)
816 data = teuthology.get_file(
817 remote=remote,
818 path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
819 )
820 keys.append(('client', id_, data))
821 keys_fp.write(data)
822
823 log.info('Adding keys to all mons...')
824 writes = mons.run(
825 args=[
826 'sudo', 'tee', '-a',
827 keyring_path,
828 ],
829 stdin=run.PIPE,
830 wait=False,
831 stdout=StringIO(),
832 )
833 keys_fp.seek(0)
834 teuthology.feed_many_stdins_and_close(keys_fp, writes)
835 run.wait(writes)
836 for type_, id_, data in keys:
837 run.wait(
838 mons.run(
839 args=[
840 'sudo',
841 'adjust-ulimits',
842 'ceph-coverage',
843 coverage_dir,
844 'ceph-authtool',
845 keyring_path,
846 '--name={type}.{id}'.format(
847 type=type_,
848 id=id_,
849 ),
850 ] + list(generate_caps(type_)),
851 wait=False,
852 ),
853 )
854
855 log.info('Running mkfs on mon nodes...')
856 for remote, roles_for_host in mons.remotes.iteritems():
857 for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
858 _, _, id_ = teuthology.split_role(role)
859 remote.run(
860 args=[
861 'sudo',
862 'mkdir',
863 '-p',
864 '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
865 ],
866 )
867 remote.run(
868 args=[
869 'sudo',
870 'adjust-ulimits',
871 'ceph-coverage',
872 coverage_dir,
873 'ceph-mon',
874 '--cluster', cluster_name,
875 '--mkfs',
876 '-i', id_,
877 '--monmap', monmap_path,
878 '--keyring', keyring_path,
879 ],
880 )
881
882 run.wait(
883 mons.run(
884 args=[
885 'rm',
886 '--',
887 monmap_path,
888 ],
889 wait=False,
890 ),
891 )
892
893 try:
894 yield
895 except Exception:
896 # we need to know this below
897 ctx.summary['success'] = False
898 raise
899 finally:
900 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
901
902 log.info('Checking cluster log for badness...')
903
904 def first_in_ceph_log(pattern, excludes):
905 """
906 Find the first occurrence of the pattern specified in the Ceph log.
907 Returns None if none found.
908
909 :param pattern: Pattern scanned for.
910 :param excludes: Patterns to ignore.
911 :return: First line of text (or None if not found)
912 """
913 args = [
914 'sudo',
915 'egrep', pattern,
916 '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
917 ]
918 for exclude in excludes:
919 args.extend([run.Raw('|'), 'egrep', '-v', exclude])
920 args.extend([
921 run.Raw('|'), 'head', '-n', '1',
922 ])
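# The args built above amount to roughly this shell pipeline (default
# cluster name shown, a single exclude for brevity):
#   sudo egrep PATTERN /var/log/ceph/ceph.log | egrep -v EXCLUDE | head -n 1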
923 r = mon0_remote.run(
924 stdout=StringIO(),
925 args=args,
926 )
927 stdout = r.stdout.getvalue()
928 if stdout != '':
929 return stdout
930 return None
931
932 if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
933 config['log_whitelist']) is not None:
934 log.warning('Found errors (ERR|WRN|SEC) in cluster log')
935 ctx.summary['success'] = False
936 # use the most severe problem as the failure reason
937 if 'failure_reason' not in ctx.summary:
938 for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
939 match = first_in_ceph_log(pattern, config['log_whitelist'])
940 if match is not None:
941 ctx.summary['failure_reason'] = \
942 '"{match}" in cluster log'.format(
943 match=match.rstrip('\n'),
944 )
945 break
946
947 for remote, dirs in devs_to_clean.iteritems():
948 for dir_ in dirs:
949 log.info('Unmounting %s on %s' % (dir_, remote))
950 try:
951 remote.run(
952 args=[
953 'sync',
954 run.Raw('&&'),
955 'sudo',
956 'umount',
957 '-f',
958 dir_
959 ]
960 )
961 except Exception as e:
962 remote.run(args=[
963 'sudo',
964 run.Raw('PATH=/usr/sbin:$PATH'),
965 'lsof',
966 run.Raw(';'),
967 'ps', 'auxf',
968 ])
969 raise e
970
971 if config.get('tmpfs_journal'):
972 log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
973 for remote, roles_for_host in osds.remotes.iteritems():
974 remote.run(
975 args=['sudo', 'umount', '-f', '/mnt'],
976 check_status=False,
977 )
978
979 if ctx.archive is not None and \
980 not (ctx.config.get('archive-on-error') and ctx.summary['success']):
981
982 # archive mon data, too
983 log.info('Archiving mon data...')
984 path = os.path.join(ctx.archive, 'data')
985 try:
986 os.makedirs(path)
987 except OSError as e:
988 if e.errno == errno.EEXIST:
989 pass
990 else:
991 raise
992 for remote, roles in mons.remotes.iteritems():
993 for role in roles:
994 is_mon = teuthology.is_type('mon', cluster_name)
995 if is_mon(role):
996 _, _, id_ = teuthology.split_role(role)
997 mon_dir = '/var/lib/ceph/mon/' + \
998 '{0}-{1}'.format(cluster_name, id_)
999 teuthology.pull_directory_tarball(
1000 remote,
1001 mon_dir,
1002 path + '/' + role + '.tgz')
1003
1004 log.info('Cleaning ceph cluster...')
1005 run.wait(
1006 ctx.cluster.run(
1007 args=[
1008 'sudo',
1009 'rm',
1010 '-rf',
1011 '--',
1012 conf_path,
1013 keyring_path,
1014 data_dir,
1015 monmap_path,
1016 run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
1017 ],
1018 wait=False,
1019 ),
1020 )
1021
1022
1023def osd_scrub_pgs(ctx, config):
1024 """
1025 Scrub pgs when we exit.
1026
1027 First make sure all pgs are active and clean.
1028 Next scrub all osds.
1029 Then periodically check until all pgs have scrub time stamps that
1030 indicate the last scrub completed. Time out if no progress is made
1031 here after a fixed number of retry intervals.
1032 """
1033 retries = 40
1034 delays = 20
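# Roughly speaking, retries * delays bounds each wait loop below: with the
# values above, about 800 seconds without progress before giving up.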
1035 cluster_name = config['cluster']
1036 manager = ctx.managers[cluster_name]
1037 all_clean = False
1038 for _ in range(0, retries):
1039 stats = manager.get_pg_stats()
1040 bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
1041 if not bad:
1042 all_clean = True
1043 break
1044 log.info(
1045 "Waiting for all PGs to be active and clean, waiting on %s" % bad)
1046 time.sleep(delays)
1047 if not all_clean:
1048 raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
1049 check_time_now = time.localtime()
1050 time.sleep(1)
1051 all_roles = teuthology.all_roles(ctx.cluster)
1052 for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
1053 log.info("Scrubbing {osd}".format(osd=role))
1054 _, _, id_ = teuthology.split_role(role)
1055 # allow this to fail; in certain cases the OSD might not be up
1056 # at this point. we will catch all pgs below.
1057 try:
1058 manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
1059 except run.CommandFailedError:
1060 pass
1061 prev_good = 0
1062 gap_cnt = 0
1063 loop = True
1064 while loop:
1065 stats = manager.get_pg_stats()
1066 timez = [(stat['pgid'],stat['last_scrub_stamp']) for stat in stats]
1067 loop = False
1068 thiscnt = 0
1069 for (pgid, tmval) in timez:
1070 pgtm = time.strptime(tmval[0:tmval.find('.')], '%Y-%m-%d %H:%M:%S')
1071 if pgtm > check_time_now:
1072 thiscnt += 1
1073 else:
1074 log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
1075 loop = True
1076 if thiscnt > prev_good:
1077 prev_good = thiscnt
1078 gap_cnt = 0
1079 else:
1080 gap_cnt += 1
1081 if gap_cnt % 6 == 0:
1082 for (pgid, tmval) in timez:
1083 # re-request scrub every so often in case the earlier
1084 # request was missed. do not do it every time because
1085 # the scrub may be in progress or not reported yet and
1086 # we will starve progress.
1087 manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
1088 if gap_cnt > retries:
1089 raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
1090 if loop:
1091 log.info('Still waiting for all pgs to be scrubbed.')
1092 time.sleep(delays)
1093
1094
1095@contextlib.contextmanager
1096def run_daemon(ctx, config, type_):
1097 """
1098 Run daemons for a role type. Handle the startup and termination of a daemon.
1099 On startup -- set coverages, cpu_profile, valgrind values for all remotes,
1100 and a max_mds value for one mds.
1101 On cleanup -- Stop all existing daemons of this type.
1102
1103 :param ctx: Context
1104 :param config: Configuration
1105 :param type_: Role type
1106 """
1107 cluster_name = config['cluster']
1108 log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
1109 testdir = teuthology.get_testdir(ctx)
1110 daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
1111
1112 # check whether any daemons of this type are configured
1113 if daemons is None:
1114 return
1115 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1116
1117 daemon_signal = 'kill'
1118 if config.get('coverage') or config.get('valgrind') is not None:
1119 daemon_signal = 'term'
1120
1121 # create osds in order. (this only matters for pre-luminous, which might
1122 # be hammer, which doesn't take an id_ argument to legacy 'osd create').
1123 osd_uuids = {}
1124 for remote, roles_for_host in daemons.remotes.iteritems():
1125 is_type_ = teuthology.is_type(type_, cluster_name)
1126 for role in roles_for_host:
1127 if not is_type_(role):
1128 continue
1129 _, _, id_ = teuthology.split_role(role)
1130
1131
1132 if type_ == 'osd':
1133 datadir='/var/lib/ceph/osd/{cluster}-{id}'.format(
1134 cluster=cluster_name, id=id_)
1135 osd_uuid = teuthology.get_file(
1136 remote=remote,
1137 path=datadir + '/fsid',
1138 sudo=True,
1139 ).strip()
1140 osd_uuids[id_] = osd_uuid
1141 for osd_id in range(len(osd_uuids)):
1142 id_ = str(osd_id)
1143 osd_uuid = osd_uuids.get(id_)
1144 try:
1145 remote.run(
1146 args=[
1147 'sudo', 'ceph', '--cluster', cluster_name,
1148 'osd', 'new', osd_uuid, id_,
1149 ]
1150 )
1151 except:
1152 # fallback to pre-luminous (hammer or jewel)
1153 remote.run(
1154 args=[
1155 'sudo', 'ceph', '--cluster', cluster_name,
1156 'osd', 'create', osd_uuid,
1157 ]
1158 )
1159 if config.get('add_osds_to_crush'):
1160 remote.run(
1161 args=[
1162 'sudo', 'ceph', '--cluster', cluster_name,
1163 'osd', 'crush', 'create-or-move', 'osd.' + id_,
1164 '1.0', 'host=localhost', 'root=default',
1165 ]
1166 )
1167
1168 for remote, roles_for_host in daemons.remotes.iteritems():
1169 is_type_ = teuthology.is_type(type_, cluster_name)
1170 for role in roles_for_host:
1171 if not is_type_(role):
1172 continue
1173 _, _, id_ = teuthology.split_role(role)
1174
1175 run_cmd = [
1176 'sudo',
1177 'adjust-ulimits',
1178 'ceph-coverage',
1179 coverage_dir,
1180 'daemon-helper',
1181 daemon_signal,
1182 ]
1183 run_cmd_tail = [
1184 'ceph-%s' % (type_),
1185 '-f',
1186 '--cluster', cluster_name,
1187 '-i', id_]
1188
1189 if type_ in config.get('cpu_profile', []):
1190 profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
1191 run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
1192
1193 if config.get('valgrind') is not None:
1194 valgrind_args = None
1195 if type_ in config['valgrind']:
1196 valgrind_args = config['valgrind'][type_]
1197 if role in config['valgrind']:
1198 valgrind_args = config['valgrind'][role]
1199 run_cmd = teuthology.get_valgrind_args(testdir, role,
1200 run_cmd,
1201 valgrind_args)
1202
1203 run_cmd.extend(run_cmd_tail)
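# Illustrative final command for role osd.0 on the default cluster when
# neither cpu_profile nor valgrind applies (coverage dir abbreviated):
#   sudo adjust-ulimits ceph-coverage <coverage_dir> daemon-helper kill \
#       ceph-osd -f --cluster ceph -i 0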
1204
1205 # always register mgr; don't necessarily start
1206 ctx.daemons.register_daemon(
1207 remote, type_, id_,
1208 cluster=cluster_name,
1209 args=run_cmd,
1210 logger=log.getChild(role),
1211 stdin=run.PIPE,
1212 wait=False
1213 )
1214 if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
1215 role = cluster_name + '.' + type_
1216 ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
1217
1218 try:
1219 yield
1220 finally:
1221 teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
1222
1223
1224def healthy(ctx, config):
1225 """
1226 Wait for all OSDs to be up, and for the ceph health monitor to return HEALTH_OK.
1227
1228 :param ctx: Context
1229 :param config: Configuration
1230 """
1231 config = config if isinstance(config, dict) else dict()
1232 cluster_name = config.get('cluster', 'ceph')
1233 log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
1234 manager = ctx.managers[cluster_name]
1235 try:
1236 manager.wait_for_mgr_available(timeout=30)
1237 except (run.CommandFailedError, AssertionError) as e:
1238 log.info('ignoring mgr wait error, probably testing upgrade: %s', e)
1239
1240 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1241 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1242 teuthology.wait_until_osds_up(
1243 ctx,
1244 cluster=ctx.cluster,
1245 remote=mon0_remote,
1246 ceph_cluster=cluster_name,
1247 )
1248
1249 try:
1250 manager.flush_all_pg_stats()
1251 except (run.CommandFailedError, Exception) as e:
1252 log.info('ignoring flush pg stats error, probably testing upgrade: %s', e)
1253 manager.wait_for_clean()
1254
1255 log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
1256 teuthology.wait_until_healthy(
1257 ctx,
1258 remote=mon0_remote,
1259 ceph_cluster=cluster_name,
1260 )
1261
1262 if ctx.cluster.only(teuthology.is_type('mds', cluster_name)).remotes:
1263 # Some MDSs exist, wait for them to be healthy
1264 ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
1265 ceph_fs.wait_for_daemons(timeout=300)
1266
1267
1268def wait_for_osds_up(ctx, config):
1269 """
1270 Wait for all OSDs to come up.
1271
1272 :param ctx: Context
1273 :param config: Configuration
1274 """
1275 log.info('Waiting until ceph osds are all up...')
1276 cluster_name = config.get('cluster', 'ceph')
1277 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1278 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1279 teuthology.wait_until_osds_up(
1280 ctx,
1281 cluster=ctx.cluster,
1282 remote=mon0_remote
1283 )
1284
1285
1286def wait_for_mon_quorum(ctx, config):
1287 """
1288 Check remote ceph status until all monitors are up.
1289
1290 :param ctx: Context
1291 :param config: Configuration
1292 """
1293 if isinstance(config, dict):
1294 mons = config['daemons']
1295 cluster_name = config.get('cluster', 'ceph')
1296 else:
1297 assert isinstance(config, list)
1298 mons = config
1299 cluster_name = 'ceph'
1300 firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
1301 (remote,) = ctx.cluster.only(firstmon).remotes.keys()
1302 with contextutil.safe_while(sleep=10, tries=60,
1303 action='wait for monitor quorum') as proceed:
1304 while proceed():
1305 r = remote.run(
1306 args=[
1307 'sudo',
1308 'ceph',
1309 'quorum_status',
1310 ],
1311 stdout=StringIO(),
1312 logger=log.getChild('quorum_status'),
1313 )
1314 j = json.loads(r.stdout.getvalue())
1315 q = j.get('quorum_names', [])
1316 log.debug('Quorum: %s', q)
1317 if sorted(q) == sorted(mons):
1318 break
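# Abridged, illustrative 'ceph quorum_status' output consumed above; only
# the 'quorum_names' key is inspected here:
#   {"quorum_names": ["a", "b", "c"], ...}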
1319
1320
1321def created_pool(ctx, config):
1322 """
1323 Add new pools to the dictionary of pools that the ceph-manager
1324 knows about.
1325 """
1326 for new_pool in config:
1327 if new_pool not in ctx.managers['ceph'].pools:
1328 ctx.managers['ceph'].pools[new_pool] = ctx.managers['ceph'].get_pool_property(
1329 new_pool, 'pg_num')
1330
1331
1332@contextlib.contextmanager
1333def restart(ctx, config):
1334 """
1335 restart ceph daemons
1336
1337 For example::
1338 tasks:
1339 - ceph.restart: [all]
1340
1341 For example::
1342 tasks:
1343 - ceph.restart: [osd.0, mon.1, mds.*]
1344
1345 or::
1346
1347 tasks:
1348 - ceph.restart:
1349 daemons: [osd.0, mon.1]
1350 wait-for-healthy: false
1351 wait-for-osds-up: true
1352
1353 :param ctx: Context
1354 :param config: Configuration
1355 """
1356 if config is None:
1357 config = {}
1358 elif isinstance(config, list):
1359 config = {'daemons': config}
1360
1361 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1362 clusters = set()
1363 for role in daemons:
1364 cluster, type_, id_ = teuthology.split_role(role)
1365 ctx.daemons.get_daemon(type_, id_, cluster).restart()
1366 clusters.add(cluster)
1367
1368 manager = ctx.managers['ceph']
1369 for dmon in daemons:
1370 if '.' in dmon:
1371 dm_parts = dmon.split('.')
1372 if dm_parts[1].isdigit():
1373 if dm_parts[0] == 'osd':
1374 manager.mark_down_osd(int(dm_parts[1]))
1375
1376 if config.get('wait-for-healthy', True):
1377 for cluster in clusters:
1378 healthy(ctx=ctx, config=dict(cluster=cluster))
1379 if config.get('wait-for-osds-up', False):
1380 for cluster in clusters:
1381 wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
1382 yield
1383
1384
1385@contextlib.contextmanager
1386def stop(ctx, config):
1387 """
1388 Stop ceph daemons
1389
1390 For example::
1391 tasks:
1392 - ceph.stop: [mds.*]
1393
1394 tasks:
1395 - ceph.stop: [osd.0, osd.2]
1396
1397 tasks:
1398 - ceph.stop:
1399 daemons: [osd.0, osd.2]
1400
1401 """
1402 if config is None:
1403 config = {}
1404 elif isinstance(config, list):
1405 config = {'daemons': config}
1406
1407 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1408 for role in daemons:
1409 cluster, type_, id_ = teuthology.split_role(role)
1410 ctx.daemons.get_daemon(type_, id_, cluster).stop()
1411
1412 yield
1413
1414
1415@contextlib.contextmanager
1416def wait_for_failure(ctx, config):
1417 """
1418 Wait for a failure of a ceph daemon
1419
1420 For example::
1421 tasks:
1422 - ceph.wait_for_failure: [mds.*]
1423
1424 tasks:
1425 - ceph.wait_for_failure: [osd.0, osd.2]
1426
1427 tasks:
1428 - ceph.wait_for_failure:
1429 daemons: [osd.0, osd.2]
1430
1431 """
1432 if config is None:
1433 config = {}
1434 elif isinstance(config, list):
1435 config = {'daemons': config}
1436
1437 daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
1438 for role in daemons:
1439 cluster, type_, id_ = teuthology.split_role(role)
1440 try:
1441 ctx.daemons.get_daemon(type_, id_, cluster).wait()
1442 except:
1443 log.info('Saw expected daemon failure. Continuing.')
1444 pass
1445 else:
1446 raise RuntimeError('daemon %s did not fail' % role)
1447
1448 yield
1449
1450
1451def validate_config(ctx, config):
1452 """
1453 Perform some simple validation on task configuration.
1454 Raises exceptions.ConfigError if an error is found.
1455 """
1456 # check for osds from multiple clusters on the same host
1457 for remote, roles_for_host in ctx.cluster.remotes.items():
1458 last_cluster = None
1459 last_role = None
1460 for role in roles_for_host:
1461 role_cluster, role_type, _ = teuthology.split_role(role)
1462 if role_type != 'osd':
1463 continue
1464 if last_cluster and last_cluster != role_cluster:
1465 msg = "Host should not have osds (%s and %s) from multiple clusters" % (
1466 last_role, role)
1467 raise exceptions.ConfigError(msg)
1468 last_cluster = role_cluster
1469 last_role = role
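# For example, a host whose roles include both osd.0 and backup.osd.0 would
# be rejected here, while colocating clients or monitors from different
# clusters (e.g. client.0 and backup.client.0) is allowed.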
1470
1471
1472@contextlib.contextmanager
1473def task(ctx, config):
1474 """
1475 Set up and tear down a Ceph cluster.
1476
1477 For example::
1478
1479 tasks:
1480 - ceph:
1481 - interactive:
1482
1483 You can also specify what branch to run::
1484
1485 tasks:
1486 - ceph:
1487 branch: foo
1488
1489 Or a tag::
1490
1491 tasks:
1492 - ceph:
1493 tag: v0.42.13
1494
1495 Or a sha1::
1496
1497 tasks:
1498 - ceph:
1499 sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
1500
1501 Or a local source dir::
1502
1503 tasks:
1504 - ceph:
1505 path: /home/sage/ceph
1506
1507 To capture code coverage data, use::
1508
1509 tasks:
1510 - ceph:
1511 coverage: true
1512
1513 To use btrfs, ext4, or xfs on the target's scratch disks, use::
1514
1515 tasks:
1516 - ceph:
1517 fs: xfs
1518 mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
1519 mount_options: [nobarrier, inode64]
1520
1521 Note, this will cause the task to check the /scratch_devs file on each node
1522 for available devices. If no such file is found, /dev/sdb will be used.
1523
1524 To run some daemons under valgrind, include their names
1525 and the tool/args to use in a valgrind section::
1526
1527 tasks:
1528 - ceph:
1529 valgrind:
1530 mds.1: --tool=memcheck
1531 osd.1: [--tool=memcheck, --leak-check=no]
1532
1533 Those nodes which are using memcheck or valgrind will get
1534 checked for bad results.
1535
1536 To adjust or modify config options, use::
1537
1538 tasks:
1539 - ceph:
1540 conf:
1541 section:
1542 key: value
1543
1544 For example::
1545
1546 tasks:
1547 - ceph:
1548 conf:
1549 mds.0:
1550 some option: value
1551 other key: other value
1552 client.0:
1553 debug client: 10
1554 debug ms: 1
1555
1556 By default, the cluster log is checked for errors and warnings,
1557 and the run marked failed if any appear. You can ignore log
1558 entries by giving a list of egrep-compatible regexes, e.g.:
1559
1560 tasks:
1561 - ceph:
1562 log-whitelist: ['foo.*bar', 'bad message']
1563
1564 To run multiple ceph clusters, use multiple ceph tasks, and roles
1565 with a cluster name prefix, e.g. cluster1.client.0. Roles with no
1566 cluster use the default cluster name, 'ceph'. OSDs from separate
1567 clusters must be on separate hosts. Clients and non-osd daemons
1568 from multiple clusters may be colocated. For each cluster, add an
1569 instance of the ceph task with the cluster name specified, e.g.::
1570
1571 roles:
1572 - [mon.a, osd.0, osd.1]
1573 - [backup.mon.a, backup.osd.0, backup.osd.1]
1574 - [client.0, backup.client.0]
1575 tasks:
1576 - ceph:
1577 cluster: ceph
1578 - ceph:
1579 cluster: backup
1580
1581 :param ctx: Context
1582 :param config: Configuration
1583
1584 """
1585 if config is None:
1586 config = {}
1587 assert isinstance(config, dict), \
1588 "task ceph only supports a dictionary for configuration"
1589
1590 overrides = ctx.config.get('overrides', {})
1591 teuthology.deep_merge(config, overrides.get('ceph', {}))
1592
1593 first_ceph_cluster = False
1594 if not hasattr(ctx, 'daemons'):
1595 first_ceph_cluster = True
1596 ctx.daemons = DaemonGroup()
1597
1598 testdir = teuthology.get_testdir(ctx)
1599 if config.get('coverage'):
1600 coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
1601 log.info('Creating coverage directory...')
1602 run.wait(
1603 ctx.cluster.run(
1604 args=[
1605 'install', '-d', '-m0755', '--',
1606 coverage_dir,
1607 ],
1608 wait=False,
1609 )
1610 )
1611
1612 if 'cluster' not in config:
1613 config['cluster'] = 'ceph'
1614
1615 validate_config(ctx, config)
1616
1617 subtasks = []
1618 if first_ceph_cluster:
1619 # these tasks handle general log setup and parsing on all hosts,
1620 # so they should only be run once
1621 subtasks = [
1622 lambda: ceph_log(ctx=ctx, config=None),
1623 lambda: valgrind_post(ctx=ctx, config=config),
1624 ]
1625
1626 subtasks += [
1627 lambda: cluster(ctx=ctx, config=dict(
1628 conf=config.get('conf', {}),
1629 fs=config.get('fs', 'xfs'),
1630 mkfs_options=config.get('mkfs_options', None),
1631 mount_options=config.get('mount_options', None),
1632 block_journal=config.get('block_journal', None),
1633 tmpfs_journal=config.get('tmpfs_journal', None),
1634 skip_mgr_daemons=config.get('skip_mgr_daemons', False),
1635 log_whitelist=config.get('log-whitelist', []),
1636 cpu_profile=set(config.get('cpu_profile', []),),
1637 cluster=config['cluster'],
1638 )),
1639 lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
1640 lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
1641 lambda: crush_setup(ctx=ctx, config=config),
1642 lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
1643 lambda: create_rbd_pool(ctx=ctx, config=config),
1644 lambda: cephfs_setup(ctx=ctx, config=config),
1645 lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
1646 ]
1647
1648 with contextutil.nested(*subtasks):
1649 first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
1650 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
1651 if not hasattr(ctx, 'managers'):
1652 ctx.managers = {}
1653 ctx.managers[config['cluster']] = CephManager(
1654 mon,
1655 ctx=ctx,
1656 logger=log.getChild('ceph_manager.' + config['cluster']),
1657 cluster=config['cluster'],
1658 )
1659
1660 try:
1661 if config.get('wait-for-healthy', True):
1662 healthy(ctx=ctx, config=dict(cluster=config['cluster']))
1663
1664 yield
1665 finally:
1666 if config.get('wait-for-scrub', True):
1667 osd_scrub_pgs(ctx, config)
1668
1669 # stop logging health to clog during shutdown, or else we generate
1670 # a bunch of scary messages unrelated to our actual run.
1671 firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
1672 (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
1673 mon0_remote.run(
1674 args=[
1675 'sudo',
1676 'ceph',
1677 '--cluster', config['cluster'],
1678 'tell',
1679 'mon.*',
1680 'injectargs',
1681 '--',
1682 '--no-mon-health-to-clog',
1683 ]
1684 )