1 | """ |
2 | Test if we can recover the leveldb from OSD after where all leveldbs are | |
3 | corrupted | |
4 | """ | |

import logging
import os.path
import shutil
import tempfile

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def _push_directory(path, remote, remote_dir):
    """
    local_temp_path=`mktemp`
    tar czf $local_temp_path $path
    ssh remote mkdir -p $remote_dir
    remote_temp_path=`mktemp`
    scp $local_temp_path $remote_temp_path
    rm $local_temp_path
    tar xzf $remote_temp_path -C $remote_dir
    ssh remote rm $remote_temp_path
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    _, fname = os.path.split(local_temp_path)
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def _nuke_mons(manager, mons, mon_id):
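    """
    Kill every monitor and wipe its data directory.  For the monitor named
    `mon_id` only the store.db subdirectory is removed, so its directory
    layout can be reused when the rebuilt store is pushed back later.
    """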
    assert mons
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster, m))
            if m == mon_id:
                # only remove store.db for the first mon: recreating just the
                # store is easier than running mkfs on it and then replacing
                # its store.db with the recovered one
                store_dir = os.path.join(mon_data, 'store.db')
                remote.run(args=['sudo', 'rm', '-r', store_dir])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])


def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
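    """
    Rebuild the store of monitor `mon_id` from the OSDs: walk every OSD,
    accumulating the cluster maps into a mon store with
    'ceph-objectstore-tool --op update-mon-db', then push the result into
    the monitor's data directory and run 'ceph-monstore-tool ... rebuild'
    with the given keyring.
    """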
    local_mstore = tempfile.mkdtemp()

    # collect the maps from all OSDs
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    assert osds
    for osd, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

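            # the store built so far is pushed to the OSD's host, updated in
            # place with this OSD's copy of the maps, then pulled back below,
            # so each iteration folds one more OSD into the rebuilt store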
            _push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with the re-built mon db: push the recovered
    # store from the test node into the mon's data directory
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    _push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)

    # fill up the caps in the keyring file
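    # (the rebuild step below reads this keyring, so giving mon. and
    # client.admin full caps here keeps the recovered cluster usable)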
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *',
                  '--cap', 'mgr', 'allow *'])
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])


def _revive_mons(manager, mons, recovered, keyring_path):
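    """
    Run mkfs on every monitor whose data directory was removed (all but the
    recovered one), then revive them all and wait for them to form a quorum.
    """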
    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
    n_mons = 0
    is_mon = teuthology.is_type('mon')
    for remote, roles in mons.remotes.iteritems():
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if recovered != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            log.info('reviving mon.{0}'.format(m))
            manager.revive_mon(m)
            n_mons += 1
    manager.wait_for_mon_quorum_size(n_mons, timeout=30)


def _revive_mgrs(ctx, manager):
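    """Revive every mgr daemon in the cluster."""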
    is_mgr = teuthology.is_type('mgr')
    mgrs = ctx.cluster.only(is_mgr)
    for _, roles in mgrs.remotes.iteritems():
        for role in roles:
            if not is_mgr(role):
                continue
            _, _, mgr_id = teuthology.split_role(role)
            log.info('reviving mgr.{0}'.format(mgr_id))
            manager.revive_mgr(mgr_id)


def _revive_osds(ctx, manager):
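    """Revive every OSD in the cluster."""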
    is_osd = teuthology.is_type('osd')
    osds = ctx.cluster.only(is_osd)
    for _, roles in osds.remotes.iteritems():
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)


def task(ctx, config):
    """
    Test monitor recovery from OSD
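
    Illustrative snippet (assumes the task is registered as rebuild_mondb;
    it reads an optional ``keyring_path`` setting, defaulting to
    /etc/ceph/{cluster}.keyring)::

        tasks:
        - rebuild_mondb:
            keyring_path: /etc/ceph/ceph.keyring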
192 | """ | |
193 | if config is None: | |
194 | config = {} | |
195 | assert isinstance(config, dict), \ | |
196 | 'task only accepts a dict for configuration' | |
197 | ||
198 | first_mon = teuthology.get_first_mon(ctx, config) | |
199 | (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys() | |
200 | manager = ceph_manager.CephManager( | |
201 | mon, | |
202 | ctx=ctx, | |
203 | logger=log.getChild('ceph_manager')) | |
204 | ||
205 | mons = ctx.cluster.only(teuthology.is_type('mon')) | |
206 | # note down the first cluster_name and mon_id | |
207 | # we will recover it later on | |
208 | cluster_name, _, mon_id = teuthology.split_role(first_mon) | |
209 | _nuke_mons(manager, mons, mon_id) | |
210 | default_keyring = '/etc/ceph/{cluster}.keyring'.format( | |
211 | cluster=cluster_name) | |
212 | keyring_path = config.get('keyring_path', default_keyring) | |
213 | _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path) | |
214 | _revive_mons(manager, mons, mon_id, keyring_path) | |
215 | _revive_mgrs(ctx, manager) | |
216 | _revive_osds(ctx, manager) |