]> git.proxmox.com Git - ceph.git/blob - ceph/qa/tasks/mon_seesaw.py
bump version to 12.2.12-pve1
[ceph.git] / ceph / qa / tasks / mon_seesaw.py
1 from cStringIO import StringIO
2
3 import contextlib
4 import logging
5 import random
6
7 from teuthology import misc as teuthology
8 from teuthology.orchestra import run
9
10 from ceph_manager import CephManager, write_conf
11
12
13 log = logging.getLogger(__name__)
14
15
def _get_mons(ctx):
    """Return the monitor ids known to teuthology.

    Each name reported by teuthology.get_mon_names() has its first
    len('mon.') characters dropped; the remainder is used as the id.
    """
    prefix = 'mon.'
    mon_ids = []
    for mon_name in teuthology.get_mon_names(ctx):
        mon_ids.append(mon_name[len(prefix):])
    return mon_ids
18
19
20 # teuthology prepares the monitor IPs (and ports) in get_mons(), we can
21 # enumerate all monitor ports ([6789..]), and find the next available one.
def _get_next_port(ctx, ip, cluster):
    """Return the lowest port >= 6789 that no monitor on `ip` is using.

    The 'mon addr' entry of every monitor of `cluster` is read from
    ctx.ceph[cluster].conf; only monitors whose address part equals `ip`
    are taken into account.

    :param ctx: teuthology context holding the per-cluster configuration
    :param ip: address of the host the new monitor will listen on
    :param cluster: name of the cluster whose monitors are inspected
    :returns: the first free port, counting upwards from 6789
    """
    # assuming we have only one cluster here.
    used = set()
    for name in teuthology.get_mon_names(ctx, cluster):
        addr = ctx.ceph[cluster].conf[name]['mon addr']
        mon_ip, mon_port = addr.split(':')
        if mon_ip != ip:
            continue
        used.add(int(mon_port))
    # Probe upwards with a membership test instead of walking the sorted
    # list: a used port below 6789 must not stop the search early, or 6789
    # would be handed out even though it is already taken.
    port = 6789
    while port in used:
        port += 1
    return port
38
39
def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
    """Create the store for monitor `mon` on `remote`.

    Co-locates a new monitor on a remote where an existing monitor is
    hosted: the data directory is created, the current monmap is fetched
    through `manager` and `ceph-mon --mkfs` is run against it.  The
    temporary monmap file is removed afterwards.

    If `conf_path` is set, a 'mon addr' entry for `name` is added to the
    cluster configuration and the configuration is written out, so that
    the ceph CLI is able to connect to the cluster.
    """
    cluster = manager.cluster
    remote.run(args=['sudo', 'mkdir', '-p', data_path])

    keyring_path = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster)
    monmap_path = '{tdir}/{cluster}.monmap'.format(
        tdir=teuthology.get_testdir(ctx), cluster=cluster)

    # The map is dumped on the controller; ship it over when the new
    # monitor lives on a different remote.
    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
    on_other_remote = manager.controller != remote
    if on_other_remote:
        payload = teuthology.get_file(manager.controller, monmap_path)
        teuthology.write_file(remote, monmap_path, StringIO(payload))

    mkfs_cmd = ['sudo', 'ceph-mon']
    mkfs_cmd += ['--cluster', cluster]
    mkfs_cmd += ['--mkfs']
    mkfs_cmd += ['-i', mon]
    mkfs_cmd += ['--monmap', monmap_path]
    mkfs_cmd += ['--keyring', keyring_path]
    remote.run(args=mkfs_cmd)

    if on_other_remote:
        teuthology.delete_file(remote, monmap_path)
    # raw_cluster_cmd() is performed using sudo, so sudo here also.
    teuthology.delete_file(manager.controller, monmap_path, sudo=True)

    if not conf_path:
        return
    # Register the new monitor in ceph.conf on the next free port.
    ip = remote.ip_address
    mon_addr = '{ip}:{port}'.format(
        ip=ip, port=_get_next_port(ctx, ip, cluster))
    ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
    write_conf(ctx, conf_path, cluster)
73
74
def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
    """Drop monitor `name` from the cluster conf and delete its data.

    The section `name` is removed from ctx.ceph[cluster].conf, the
    configuration is written to `conf_path`, and `data_path` is removed
    on `remote` with sudo.
    """
    cluster_name = manager.cluster
    cluster_conf = ctx.ceph[cluster_name].conf
    del cluster_conf[name]
    write_conf(ctx, conf_path, cluster_name)
    wipe_cmd = ['sudo', 'rm', '-rf', data_path]
    remote.run(args=wipe_cmd)
80
81
@contextlib.contextmanager
def _prepare_mon(ctx, manager, remote, mon):
    """Context manager providing a freshly created monitor `mon` on `remote`.

    On entry the monitor store is created and the monitor is added to the
    cluster configuration (see _setup_mon()); on exit both are removed
    again (see _teardown_mon()).  The teardown also runs when the body of
    the with-statement raises, so a failing test does not leave a stale
    conf entry or data directory behind.
    """
    cluster = manager.cluster
    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
        cluster=cluster, id=mon)
    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
    name = 'mon.{0}'.format(mon)
    # Setup stays outside the try block: if it fails there is no conf
    # entry to delete, and the teardown would raise on the missing key.
    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
    try:
        yield
    finally:
        _teardown_mon(ctx, manager, remote, name,
                      data_path, conf_path)
93
94
95 # run_daemon() in ceph.py starts a herd of daemons of the same type, but
96 # _run_daemon() starts only one instance.
@contextlib.contextmanager
def _run_daemon(ctx, remote, cluster, type_, id_):
    """Context manager running a single ceph daemon on `remote`.

    The daemon `ceph-<type_>` with id `id_` is registered with
    ctx.daemons and yielded to the caller.  It is stopped on exit, also
    when the body of the with-statement raises, so that a failure does
    not leave the daemon running.

    :param ctx: teuthology context
    :param remote: remote on which the daemon is started
    :param cluster: name of the cluster the daemon belongs to
    :param type_: daemon type, used to build the `ceph-<type_>` binary name
    :param id_: id passed to the daemon with `-i`
    """
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    daemon_signal = 'kill'
    run_cmd = [
        'sudo',
        'adjust-ulimits',
        'ceph-coverage',
        coverage_dir,
        'daemon-helper',
        daemon_signal,
    ]
    run_cmd_tail = [
        'ceph-%s' % (type_),
        '-f',
        '--cluster', cluster,
        '-i', id_]
    run_cmd.extend(run_cmd_tail)
    ctx.daemons.add_daemon(remote, type_, id_,
                           cluster=cluster,
                           args=run_cmd,
                           logger=log.getChild(type_),
                           stdin=run.PIPE,
                           wait=False)
    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
    try:
        yield daemon
    finally:
        daemon.stop()
125
126
@contextlib.contextmanager
def task(ctx, config):
    """
    replace a monitor with a newly added one, and then revert this change

    How it works::
        1. add a mon with specified id (mon.victim_prime)
        2. wait for quorum
        3. remove a monitor with specified id (mon.victim), mon.victim will
           commit suicide
        4. wait for quorum
        5. <yield>
        6. add mon.victim back, and start it
        7. wait for quorum
        8. remove mon.victim_prime
        9. wait for quorum

    Options::
        victim     the id of the mon to be removed (pick a random mon by
                   default)
        replacer   the id of the new mon (use "${victim}_prime" if not
                   specified)
    """
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() instead of iterkeys(): same result for this single-element
    # unpack, and not tied to the python2-only dict API.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task mon_seesaw only supports a dictionary for configuration"
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
    # An option that is present but empty (None) is treated like a missing
    # one instead of producing a monitor called "None".
    victim = config.get('victim')
    if victim is None:
        victim = random.choice(_get_mons(ctx))
    replacer = config.get('replacer')
    if replacer is None:
        replacer = '{0}_prime'.format(victim)
    remote = manager.find_remote('mon', victim)
    quorum = manager.get_mon_quorum()
    cluster = manager.cluster
    log.info('replacing {victim} with {replacer}'.format(victim=victim,
                                                         replacer=replacer))
    with _prepare_mon(ctx, manager, remote, replacer):
        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
            # replacer will join the quorum automatically
            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
            # if we don't remove the victim from monmap, there is chance that
            # we are leaving the new joiner with a monmap of 2 mon, and it will
            # not able to reach the other one, it will be keeping probing for
            # ever.
            log.info('removing {mon}'.format(mon=victim))
            manager.raw_cluster_cmd('mon', 'remove', victim)
            manager.wait_for_mon_quorum_size(len(quorum), 10)
            # the victim will commit suicide after being removed from
            # monmap, let's wait until it stops.
            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
            try:
                # perform other tasks
                yield
            finally:
                # bring the victim back online
                # nuke the monstore of victim, otherwise it will refuse to boot
                # with following message:
                #
                # not in monmap and have been in a quorum before; must have
                # been removed
                log.info('re-adding {mon}'.format(mon=victim))
                data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
                    cluster=cluster, id=victim)
                remote.run(args=['sudo', 'rm', '-rf', data_path])
                name = 'mon.{0}'.format(victim)
                # conf_path is None: the victim's entry is still in
                # ceph.conf, only its store has to be recreated.
                _setup_mon(ctx, manager, remote, victim, name, data_path, None)
                log.info('reviving {mon}'.format(mon=victim))
                manager.revive_mon(victim)
                manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
                manager.raw_cluster_cmd('mon', 'remove', replacer)
                manager.wait_for_mon_quorum_size(len(quorum), 10)
196 manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
197 manager.raw_cluster_cmd('mon', 'remove', replacer)
198 manager.wait_for_mon_quorum_size(len(quorum), 10)