# ceph/qa/tasks/rebuild_mondb.py (from ceph v15.2.4)
"""
Test if we can recover the mon's leveldb store from the OSDs after all
copies of the leveldb have been corrupted
"""
5
6 import logging
7 import os.path
8 import shutil
9 import tempfile
10
11 from tasks import ceph_manager
12 from teuthology import misc as teuthology
13
14 log = logging.getLogger(__name__)
15
16
def _push_directory(path, remote, remote_dir):
    """
    Copy the contents of local directory `path` into `remote_dir` on `remote`.

    Roughly equivalent to:
        local_temp_path=`mktemp`
        tar czf $local_temp_path -C $path .
        scp $local_temp_path remote:$remote_temp_path
        rm $local_temp_path
        ssh remote tar xzf $remote_temp_path -C $remote_dir
        ssh remote rm $remote_temp_path

    Note: `remote_dir` must already exist on the remote; this helper does
    not create it.
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    # mkstemp() is only used here to generate a unique path for the remote
    # side, but it also creates the file locally -- remove that local
    # artifact right away so it does not leak on the test host.
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    os.remove(remote_temp_path)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
47
48
def _nuke_mons(manager, mons, mon_id):
    """Kill every monitor and wipe its on-disk store.

    The mon identified by `mon_id` keeps its data directory (only
    store.db is removed) so a rebuilt store can later be dropped in
    without running mkfs; every other mon loses its whole data dir.
    """
    assert mons
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.items():
        for role in filter(is_mon, roles):
            cluster, _, m = teuthology.split_role(role)
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster, m))
            if m == mon_id:
                # so we will only need to recreate the store.db for the
                # first mon, would be easier than mkfs on it then replace
                # its store.db with the recovered one
                remote.run(args=['sudo', 'rm', '-r',
                                 os.path.join(mon_data, 'store.db')])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])
71
72
def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
    """
    Rebuild the mon store from the OSDs and install it on mon `mon_id`.

    For every OSD in the cluster, the store accumulated so far is pushed
    to the OSD's host, and `ceph-objectstore-tool --op update-mon-db`
    folds in the maps kept by that OSD.  The completed store is then
    pushed to `mon` and finalized with `ceph-monstore-tool rebuild`,
    using the monmap previously stashed at /tmp/monmap and the
    caps-refreshed keyring at `keyring_path`.

    :param ctx: teuthology run context (provides ctx.cluster remotes)
    :param manager: CephManager used to drive ceph-objectstore-tool
    :param cluster_name: expected cluster name, asserted against each role
    :param mon: remote of the mon to be recovered
    :param mon_id: id of the mon to be recovered
    :param keyring_path: path (on the mon host) of the keyring to update
    """
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    assert osds
    for osd, roles in osds.remotes.items():
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            # 'o+x' presumably so non-root users can traverse into the
            # dir -- TODO confirm
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

            # ship the store accumulated so far; the local copy is no
            # longer needed once it is on the OSD host
            _push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--no-mon-config --op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            # NOTE(review): this logs the just-removed local_mstore; the
            # fresh destination dir is only created on the next line
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with re-built mon db
    # pull from recovered leveldb from client
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    _push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)

    # fill up the caps in the keyring file
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *',
                  '--cap', 'mgr', 'allow *'])
    # rebuild store.db from the collected maps, using the monmap stashed
    # at /tmp/monmap before the mons were nuked
    mon.run(args=['sudo', '-u', 'ceph',
                  'CEPH_ARGS=--no-mon-config',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--',
                  '--keyring', keyring_path,
                  '--monmap', '/tmp/monmap',
                  ])
138
139
def _revive_mons(manager, mons, recovered, keyring_path):
    """Bring all monitors back up, running mkfs on every mon except
    `recovered`, whose store was rebuilt in place.

    Waits for all revived mons to form a quorum before returning.
    """
    # the initial monmap is in the ceph.conf, so we are good.
    is_mon = teuthology.is_type('mon')
    n_mons = 0
    for remote, roles in mons.remotes.items():
        for role in filter(is_mon, roles):
            cluster, _, m = teuthology.split_role(role)
            if m != recovered:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(args=['sudo',
                                 'ceph-mon',
                                 '--cluster', cluster,
                                 '--mkfs',
                                 '-i', m,
                                 '--keyring', keyring_path,
                                 '--monmap', '/tmp/monmap'])
            log.info('reviving mon.{0}'.format(m))
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)
167
168
def _revive_mgrs(ctx, manager):
    """Restart every mgr daemon in the cluster."""
    is_mgr = teuthology.is_type('mgr')
    for _, roles in ctx.cluster.only(is_mgr).remotes.items():
        for role in filter(is_mgr, roles):
            _, _, mgr_id = teuthology.split_role(role)
            log.info('reviving mgr.{0}'.format(mgr_id))
            manager.revive_mgr(mgr_id)
179
180
def _revive_osds(ctx, manager):
    """Restart every OSD daemon in the cluster."""
    is_osd = teuthology.is_type('osd')
    for _, roles in ctx.cluster.only(is_osd).remotes.items():
        for role in filter(is_osd, roles):
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)
191
192
def task(ctx, config):
    """
    Test monitor recovery from OSD
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    # stash a monmap for later
    mon.run(args=['ceph', 'mon', 'getmap', '-o', '/tmp/monmap'])

    manager = ceph_manager.CephManager(mon,
                                       ctx=ctx,
                                       logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # remember the first cluster_name and mon_id:
    # that is the mon we will recover later on
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, mon_id)
    keyring_path = config.get(
        'keyring_path',
        '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name))
    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
    _revive_mons(manager, mons, mon_id, keyring_path)
    _revive_mgrs(ctx, manager)
    _revive_osds(ctx, manager)