# ceph/qa/tasks/mon_seesaw.py
from cStringIO import StringIO

import contextlib
import logging
import random

from teuthology import misc as teuthology
from teuthology.orchestra import run

from ceph_manager import CephManager, write_conf


log = logging.getLogger(__name__)


def _get_mons(ctx):
    return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]


# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
# enumerate all monitor ports ([6789..]), and find the next available one.
def _get_next_port(ctx, ip, cluster):
    # assuming we have only one cluster here.
    used = []
    for name in teuthology.get_mon_names(ctx, cluster):
        addr = ctx.ceph[cluster].conf[name]['mon addr']
        mon_ip, mon_port = addr.split(':')
        if mon_ip != ip:
            continue
        used.append(int(mon_port))
    # find the first port at or above 6789 that is not already taken on this host
    port = 6789
    while port in used:
        port += 1
    return port
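
# For example (values are illustrative only): if this host already carries
# mons at ports 6789 and 6790, _get_next_port() returns 6791; if only 6790
# is taken, it returns 6789.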


def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
    # co-locate a new monitor on remote where an existing monitor is hosted
    cluster = manager.cluster
    remote.run(args=['sudo', 'mkdir', '-p', data_path])
    keyring_path = '/etc/ceph/{cluster}.keyring'.format(
        cluster=manager.cluster)
    testdir = teuthology.get_testdir(ctx)
    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
                                                   cluster=cluster)
    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
    if manager.controller != remote:
        monmap = teuthology.get_file(manager.controller, monmap_path)
        teuthology.write_file(remote, monmap_path, StringIO(monmap))
    # initialize the new monitor's data directory with the current monmap
    remote.run(
        args=[
            'sudo',
            'ceph-mon',
            '--cluster', cluster,
            '--mkfs',
            '-i', mon,
            '--monmap', monmap_path,
            '--keyring', keyring_path])
    if manager.controller != remote:
        teuthology.delete_file(remote, monmap_path)
    # raw_cluster_cmd() is performed using sudo, so sudo here also.
    teuthology.delete_file(manager.controller, monmap_path, sudo=True)
    # update ceph.conf so that the ceph CLI is able to connect to the cluster
    if conf_path:
        ip = remote.ip_address
        port = _get_next_port(ctx, ip, cluster)
        mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
        ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
        write_conf(ctx, conf_path, cluster)
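
# The steps above are roughly what one would do by hand to co-locate an extra
# monitor (paths and the mon id "b" below are only placeholders):
#
#   sudo mkdir -p /var/lib/ceph/mon/ceph-b
#   ceph mon getmap -o /tmp/monmap
#   sudo ceph-mon --cluster ceph --mkfs -i b \
#       --monmap /tmp/monmap --keyring /etc/ceph/ceph.keyring
#
# followed by adding a "mon addr" entry for mon.b to ceph.conf.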


def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
    cluster = manager.cluster
    del ctx.ceph[cluster].conf[name]
    write_conf(ctx, conf_path, cluster)
    remote.run(args=['sudo', 'rm', '-rf', data_path])


@contextlib.contextmanager
def _prepare_mon(ctx, manager, remote, mon):
    cluster = manager.cluster
    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
        cluster=cluster, id=mon)
    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
    name = 'mon.{0}'.format(mon)
    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
    yield
    _teardown_mon(ctx, manager, remote, name,
                  data_path, conf_path)


# run_daemon() in ceph.py starts a herd of daemons of the same type, but
# _run_daemon() starts only one instance.
@contextlib.contextmanager
def _run_daemon(ctx, remote, cluster, type_, id_):
    testdir = teuthology.get_testdir(ctx)
    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
    daemon_signal = 'kill'
    run_cmd = ['sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
               'daemon-helper', daemon_signal]
    run_cmd_tail = ['ceph-%s' % type_,
                    '-f',
                    '--cluster', cluster,
                    '-i', id_]
    run_cmd.extend(run_cmd_tail)
    ctx.daemons.add_daemon(remote, type_, id_,
                           cluster=cluster,
                           args=run_cmd,
                           logger=log.getChild(type_),
                           stdin=run.PIPE,
                           wait=False)
    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
    try:
        yield daemon
    finally:
        daemon.stop()
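
# A minimal usage sketch (mirroring what task() does below; the mon id is
# illustrative): set up the mon's data dir and conf entry, then run it as a
# single daemon and wait for it to join:
#
#   with _prepare_mon(ctx, manager, remote, 'a_prime'):
#       with _run_daemon(ctx, remote, cluster, 'mon', 'a_prime'):
#           manager.wait_for_mon_quorum_size(expected_size, 10)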


@contextlib.contextmanager
def task(ctx, config):
    """
    replace a monitor with a newly added one, and then revert this change

    1. add a mon with specified id (mon.victim_prime)
    2. wait for quorum
    3. remove a monitor with specified id (mon.victim), mon.victim will commit
       suicide
    4. wait for quorum
    5. add mon.victim back, and start it
    6. wait for quorum
    7. remove mon.victim_prime

    Options::

    victim       the id of the mon to be removed (pick a random mon by default)
    replacer     the id of the new mon (use "${victim}_prime" if not specified)
    """
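    # Illustrative job fragment (role names are hypothetical); both options
    # are optional, as described above:
    #
    #   tasks:
    #   - ceph:
    #   - mon_seesaw:
    #       victim: a
    #       replacer: a_prime
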
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))

    if config is None:
        config = {}
    assert isinstance(config, dict), \
        "task mon_seesaw only supports a dictionary for configuration"
    overrides = ctx.config.get('overrides', {})
    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
    victim = config.get('victim', random.choice(_get_mons(ctx)))
    replacer = config.get('replacer', '{0}_prime'.format(victim))
    remote = manager.find_remote('mon', victim)
    quorum = manager.get_mon_quorum()
    cluster = manager.cluster
    log.info('replacing {victim} with {replacer}'.format(victim=victim,
                                                         replacer=replacer))
    with _prepare_mon(ctx, manager, remote, replacer):
        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
            # replacer will join the quorum automatically
            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
            # if we don't remove the victim from the monmap, there is a chance
            # that we leave the new joiner with a monmap of 2 mons, and it will
            # not be able to reach the other one; it will keep probing forever.
            log.info('removing {mon}'.format(mon=victim))
            manager.raw_cluster_cmd('mon', 'remove', victim)
            manager.wait_for_mon_quorum_size(len(quorum), 10)
            # the victim will commit suicide after being removed from
            # monmap, let's wait until it stops.
            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
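            # at this point the cluster is back to its original quorum size,
            # e.g. 3 -> 4 while the replacer stood in, then 4 -> 3 once the
            # victim was removed (the sizes here are only an illustration)
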
            # perform other tasks
            yield

            # bring the victim back online
            # nuke the monstore of victim, otherwise it will refuse to boot
            # with following message:
            #
            #   not in monmap and have been in a quorum before; must have
            #   been removed
            log.info('re-adding {mon}'.format(mon=victim))
            data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
                cluster=cluster, id=victim)
            remote.run(args=['sudo', 'rm', '-rf', data_path])
            name = 'mon.{0}'.format(victim)
            _setup_mon(ctx, manager, remote, victim, name, data_path, None)
            log.info('reviving {mon}'.format(mon=victim))
            manager.revive_mon(victim)
            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
            manager.raw_cluster_cmd('mon', 'remove', replacer)
            manager.wait_for_mon_quorum_size(len(quorum), 10)