1"""
2Test if we can recover the leveldb from OSD after where all leveldbs are
3corrupted
4"""

import logging
import os.path
import shutil
import tempfile

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def _push_directory(path, remote, remote_dir):
    """
    Push the contents of the local directory `path` to `remote_dir` on the
    remote host, roughly:

    local_temp_path=`mktemp`
    tar czf $local_temp_path -C $path .
    remote_temp_path=`mktemp`
    scp $local_temp_path remote:$remote_temp_path
    rm $local_temp_path
    ssh remote tar xzf $remote_temp_path -C $remote_dir
    ssh remote rm $remote_temp_path
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    _, fname = os.path.split(local_temp_path)
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def _nuke_mons(manager, mons, mon_id):
    assert mons
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster, m))
            if m == mon_id:
                # we only need to recreate the store.db for the first mon;
                # that is easier than running mkfs on it and then replacing
                # its store.db with the recovered one
                store_dir = os.path.join(mon_data, 'store.db')
                remote.run(args=['sudo', 'rm', '-r', store_dir])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])


def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    assert osds
    for osd, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

            _push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
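            # This drives ceph-objectstore-tool on the OSD node; roughly
            # (a sketch, since the exact data path and extra flags are
            # filled in by CephManager.objectstore_tool):
            #
            #   ceph-objectstore-tool --data-path /var/lib/ceph/osd/<cluster>-<id> \
            #       --op update-mon-db --mon-store-path <osd_mstore>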
            # pull the updated mon db
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with the rebuilt mon db:
    # push the recovered store from the test node to the first mon
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    _push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)

    # fill up the caps in the keyring file
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *',
                  '--cap', 'mgr', 'allow *'])
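    # The rebuilt store's auth database is seeded from this keyring, so mon.
    # and client.admin are given full caps here; otherwise the recovered
    # cluster would not be administrable (as in the documented mon-store
    # recovery procedure).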
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])
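    # 'rebuild' regenerates a usable store.db from the osdmaps gathered by
    # the per-OSD update-mon-db passes above, seeding auth from the keyring;
    # this mirrors the manual disaster-recovery steps for a lost mon store.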


def _revive_mons(manager, mons, recovered, keyring_path):
    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if recovered != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            log.info('reviving mon.{0}'.format(m))
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)


def _revive_mgrs(ctx, manager):
    is_mgr = teuthology.is_type('mgr')
    mgrs = ctx.cluster.only(is_mgr)
    for _, roles in mgrs.remotes.iteritems():
        for role in roles:
            if not is_mgr(role):
                continue
            _, _, mgr_id = teuthology.split_role(role)
            log.info('reviving mgr.{0}'.format(mgr_id))
            manager.revive_mgr(mgr_id)


def _revive_osds(ctx, manager):
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    for _, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)


def task(ctx, config):
    """
    Test monitor recovery from OSD
    """
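    # Overall flow: kill every mon and wipe its store (keeping only the first
    # mon's data dir), rebuild a mon store from the maps held by the OSDs,
    # then bring mons, mgrs and OSDs back up. The only config key read
    # directly here is the optional 'keyring_path' (defaults to
    # /etc/ceph/<cluster>.keyring).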
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    # note down the first cluster_name and mon_id
    # we will recover it later on
    cluster_name, _, mon_id = teuthology.split_role(first_mon)
    _nuke_mons(manager, mons, mon_id)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
    _revive_mons(manager, mons, mon_id, keyring_path)
    _revive_mgrs(ctx, manager)
    _revive_osds(ctx, manager)