"""
Test if we can recover the mon leveldb store from the OSDs after all
copies of the mon stores have been corrupted.
"""

import logging
import os.path
import shutil
import tempfile

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def push_directory(path, remote, remote_dir):
    """
    Push the contents of a local directory to a directory on the remote,
    roughly:

    local_temp_path=`mktemp`
    tar czf $local_temp_path $path
    ssh remote mkdir -p $remote_dir
    remote_temp_path=`mktemp`
    scp $local_temp_path $remote_temp_path
    rm $local_temp_path
    ssh remote tar xzf $remote_temp_path -C $remote_dir
    ssh remote rm $remote_temp_path
    """
    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
                                           prefix='rebuild_mondb-')
    os.close(fd)
    cmd = ' '.join(['tar', 'cz',
                    '-f', local_temp_path,
                    '-C', path,
                    '--', '.'])
    teuthology.sh(cmd)
    _, fname = os.path.split(local_temp_path)
    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
                                            prefix='rebuild_mondb-')
    os.close(fd)
    remote.put_file(local_temp_path, remote_temp_path)
    os.remove(local_temp_path)
    remote.run(args=['sudo',
                     'tar', 'xz',
                     '-C', remote_dir,
                     '-f', remote_temp_path])
    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])


def task(ctx, config):
    """
    Test monitor recovery from OSD
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
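    # This follows the usual "recover the mon store from the OSDs" flow:
    # stop every monitor and wipe its store, scrape the cluster maps kept
    # by each OSD with ceph-objectstore-tool, rebuild a mon store from
    # them with ceph-monstore-tool, install it on the first monitor, and
    # mkfs and revive the remaining monitors.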

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'))

    mons = ctx.cluster.only(teuthology.is_type('mon'))
    assert mons
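    # kill every monitor and wipe its store; the first mon keeps the rest
    # of its data directory so the rebuilt store.db can be dropped back in
    # without running mkfs on it.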
    # note down the first cluster_name and mon_id;
    # we will recover that monitor later on
    cluster_name = None
    mon_id = None
    for remote, roles in mons.remotes.items():
        is_mon = teuthology.is_type('mon')
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            if cluster_name is None:
                cluster_name = cluster
                mon_id = m
            assert cluster_name == cluster
            log.info('killing {cluster}:mon.{mon}'.format(
                cluster=cluster,
                mon=m))
            manager.kill_mon(m)
            mon_data = os.path.join('/var/lib/ceph/mon/',
                                    '{0}-{1}'.format(cluster_name, m))
            if m == mon_id:
                # only remove the store.db of the first mon: replacing it
                # with the rebuilt store is easier than running mkfs on it
                # and then swapping in the recovered store.db
                store_dir = os.path.join(mon_data, 'store.db')
                remote.run(args=['sudo', 'rm', '-r', store_dir])
            else:
                remote.run(args=['sudo', 'rm', '-r', mon_data])

    local_mstore = tempfile.mkdtemp()
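    # local_mstore accumulates the rebuilt mon store: it is pushed to each
    # OSD in turn, updated there with that OSD's copy of the cluster maps,
    # and pulled back, so the final copy reflects every OSD.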

    # collect the maps from all OSDs
    osds = ctx.cluster.only(teuthology.is_type('osd'))
    assert osds
    for osd, roles in osds.remotes.items():
        is_osd = teuthology.is_type('osd')
        for role in roles:
            if not is_osd(role):
                continue
            cluster, _, osd_id = teuthology.split_role(role)
            assert cluster_name == cluster
            log.info('collecting maps from {cluster}:osd.{osd}'.format(
                cluster=cluster,
                osd=osd_id))
            # push leveldb to OSD
            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])

            push_directory(local_mstore, osd, osd_mstore)
            log.info('rm -rf {0}'.format(local_mstore))
            shutil.rmtree(local_mstore)
            # update leveldb with OSD data
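            # ceph-objectstore-tool's update-mon-db op exports the cluster
            # maps kept by this OSD into the store at --mon-store-path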
            options = '--op update-mon-db --mon-store-path {0}'
            log.info('cot {0}'.format(osd_mstore))
            manager.objectstore_tool(pool=None,
                                     options=options.format(osd_mstore),
                                     args='',
                                     osd=osd_id,
                                     do_revive=False)
            # pull the updated mon db
            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
            local_mstore = tempfile.mkdtemp()
            teuthology.pull_directory(osd, osd_mstore, local_mstore)
            log.info('rm -rf osd:{0}'.format(osd_mstore))
            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])

    # recover the first_mon with the re-built mon db:
    # push the rebuilt store from the local node to the first mon
    mon_store_dir = os.path.join('/var/lib/ceph/mon',
                                 '{0}-{1}'.format(cluster_name, mon_id))
    push_directory(local_mstore, mon, mon_store_dir)
    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
    shutil.rmtree(local_mstore)
    default_keyring = '/etc/ceph/{cluster}.keyring'.format(
        cluster=cluster_name)
    keyring_path = config.get('keyring_path', default_keyring)
    # fill up the caps in the keyring file
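    # ceph-monstore-tool rebuild re-creates the auth entries from this
    # keyring, so mon. and client.admin need full caps recorded in it first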
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'mon.',
                  '--cap', 'mon', 'allow *'])
    mon.run(args=['sudo',
                  'ceph-authtool', keyring_path,
                  '-n', 'client.admin',
                  '--cap', 'mon', 'allow *',
                  '--cap', 'osd', 'allow *',
                  '--cap', 'mds', 'allow *'])
    mon.run(args=['sudo', '-u', 'ceph',
                  'ceph-monstore-tool', mon_store_dir,
                  'rebuild', '--', '--keyring',
                  keyring_path])

    # revive monitors
    # the initial monmap is in the ceph.conf, so we are good.
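    # the other mons are recreated empty with mkfs below; they sync the
    # rebuilt store from the recovered mon once they start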
    n_mons = 0
    for remote, roles in mons.remotes.items():
        is_mon = teuthology.is_type('mon')
        for role in roles:
            if not is_mon(role):
                continue
            cluster, _, m = teuthology.split_role(role)
            assert cluster_name == cluster
            if mon_id != m:
                log.info('running mkfs on {cluster}:mon.{mon}'.format(
                    cluster=cluster,
                    mon=m))
                remote.run(
                    args=[
                        'sudo',
                        'ceph-mon',
                        '--cluster', cluster,
                        '--mkfs',
                        '-i', m,
                        '--keyring', keyring_path])
            manager.revive_mon(m)
            n_mons += 1

    manager.wait_for_mon_quorum_size(n_mons, timeout=30)
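    # the OSDs were left down by objectstore_tool (do_revive=False), so
    # bring them back now that the monitors have formed a quorum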
    for osd, roles in osds.remotes.items():
        is_osd = teuthology.is_type('osd')
        for role in roles:
            if not is_osd(role):
                continue
            _, _, osd_id = teuthology.split_role(role)
            log.info('reviving osd.{0}'.format(osd_id))
            manager.revive_osd(osd_id)