import json
import logging
from unittest import case
from tasks.ceph_test_case import CephTestCase
import os
import re
from StringIO import StringIO

from tasks.cephfs.fuse_mount import FuseMount

from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError


log = logging.getLogger(__name__)


def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f


def needs_trimming(f):
    """
    Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse
    this means it needs to be able to run as root, currently)
    """
    f.needs_trimming = True
    return f


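# Illustrative usage of the decorators above (hypothetical example, not part
# of this module):
#
#     class TestExample(CephFSTestCase):
#         @for_teuthology
#         def test_long_running(self):
#             ...  # only run in the full teuthology suite
#
#         @needs_trimming
#         def test_cache_trim(self):
#             ...  # needs a client able to trim its cache (ceph-fuse as root)

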
class CephFSTestCase(CephTestCase):
    """
    Test case for CephFS; requires the caller to populate the Filesystem and
    Mounts into the fs, mount_a and mount_b class attributes (setting mount_b
    is optional).

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None
    recovery_mount = None

    # Declarative test requirements: subclasses should override these to indicate
    # their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False
    REQUIRE_MEMSTORE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    # requires REQUIRE_FILESYSTEM = True
    REQUIRE_RECOVERY_FILESYSTEM = False

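    # Names of MDS config settings to load during setUp: each is read via the
    # MDS admin socket and set as a float attribute of the same name on the
    # test instance (see setUp below)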
    LOAD_SETTINGS = []

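    # Illustrative subclass (hypothetical), showing how the declarative
    # requirements and LOAD_SETTINGS are typically overridden:
    #
    #     class TestReconnect(CephFSTestCase):
    #         CLIENTS_REQUIRED = 2
    #         MDSS_REQUIRED = 2
    #         LOAD_SETTINGS = ["mds_reconnect_timeout"]
    #
    #         def test_reconnect(self):
    #             # populated by setUp from the MDS config
    #             self.assertGreater(self.mds_reconnect_timeout, 0)
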
    def setUp(self):
        super(CephFSTestCase, self).setUp()

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            raise case.SkipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            raise case.SkipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_KCLIENT_REMOTE:
            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
                # kclient kill() power cycles nodes, so requires clients to each be on
                # their own node
                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
                    raise case.SkipTest("kclient clients must be on separate nodes")

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                raise case.SkipTest("Require first client to be on separate server from MDSs")

        if self.REQUIRE_MEMSTORE:
            objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
            if objectstore != "memstore":
                # You certainly *could* run this on a real OSD, but you don't want to sit
                # here for hours waiting for the test to fill up a 1TB drive!
                raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients; we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing a rm -rf of files
        self.mds_cluster.mds_stop()
        self.mds_cluster.mds_fail()
        self.mds_cluster.delete_all_filesystems()
        self.fs = None  # is now invalid!
        self.recovery_fs = None

        # In case anything is in the OSD blacklist, clear it out.  This is to avoid
        # the OSD map changing in the background (due to blacklist expiry) while tests run.
        try:
            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
        except CommandFailedError:
            # Fallback for older Ceph clusters that lack "osd blacklist clear"
            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
                "osd", "dump", "--format=json-pretty"))['blacklist']
            log.info("Removing {0} blacklist entries".format(len(blacklist)))
            for addr, blacklisted_at in blacklist.items():
                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case the test changes the IDs of clients, stash them so that we can
        # reset in tearDown
        self._original_client_ids = client_mount_ids
        log.info(client_mount_ids)

        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(create=True)
            self.fs.mds_restart()

            # In case some test messed with auth caps, reset them
            for client_id in client_mount_ids:
                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
                    'auth', 'caps', "client.{0}".format(client_id),
                    'mds', 'allow',
                    'mon', 'allow r',
                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))

            # wait for mds restart to complete...
            self.fs.wait_for_daemons()

        # Mount the requested number of clients
        for i in range(0, self.CLIENTS_REQUIRED):
            self.mounts[i].mount()
            self.mounts[i].wait_until_mounted()

        if self.REQUIRE_RECOVERY_FILESYSTEM:
            if not self.REQUIRE_FILESYSTEM:
                raise case.SkipTest("Recovery filesystem requires a primary filesystem as well")
            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
                                                'enable_multiple', 'true',
                                                '--yes-i-really-mean-it')
            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
            self.recovery_fs.set_metadata_overlay(True)
            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
            self.recovery_fs.create()
            self.recovery_fs.getinfo(refresh=True)
            self.recovery_fs.mds_restart()
            self.recovery_fs.wait_for_daemons()

        # Load any config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], self.mds_cluster.mds_ids[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        super(CephFSTestCase, self).tearDown()

        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        for i, m in enumerate(self.mounts):
            m.client_id = self._original_client_ids[i]

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

    def set_conf(self, subsys, key, value):
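        """
        Set a ceph.conf option, remembering it in self.configs_set so that
        tearDown can clear it again
        """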
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)

    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
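        """
        Assert that the MDS reports the expected number of live client
        sessions (sessions in the 'killing' state are not counted)
        """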
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id, expected_state):
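        """
        Assert that the session for the given client ID is in the expected
        state in the MDS session list
        """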
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
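        """
        Return the session metadata dict for the given client ID, from a
        fresh "session ls"
        """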
        return self.get_session(client_id)

    def _session_list(self):
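        """
        Return the MDS session list, excluding sessions in the 'stale'
        or 'closed' states
        """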
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
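        """
        Return the session metadata dict for the given client ID, optionally
        looked up in a previously fetched session list
        """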
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
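        """
        Return the given session list as a dict keyed by session 'id'
        """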
        return {s['id']: s for s in session_ls}

    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warn("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise

    def assert_mds_crash(self, daemon_id):
        """
        Assert that a particular MDS daemon crashes (block until
        it does)
        """
        try:
            self.mds_cluster.mds_daemons[daemon_id].proc.wait()
        except CommandFailedError as e:
            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
            self.mds_cluster.mds_daemons[daemon_id].proc = None

            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
            # catch it later and treat it as a failure.
            p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
            core_pattern = p.stdout.getvalue().strip()
            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
                # We have seen a core_pattern that looks like it's from teuthology's coredump
                # task, so proceed to clear out the core file
                log.info("Clearing core from pattern: {0}".format(core_pattern))

                # Determine the PID of the crashed MDS by inspecting the MDSMap; it had
                # to talk to the mons to get assigned a rank to reach the point of crashing
                addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
                pid_str = addr.split("/")[1]
                log.info("Determined crasher PID was {0}".format(pid_str))

                # Substitute PID into core_pattern to get a glob
                core_glob = core_pattern.replace("%p", pid_str)
                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens

                # Verify that we see the expected single coredump matching the expected pattern
                ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "ls", run.Raw(core_glob)
                ], stdout=StringIO())
                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
                log.info("Enumerated cores: {0}".format(cores))
                self.assertEqual(len(cores), 1)

                log.info("Found core file {0}, deleting it".format(cores[0]))

                self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                    "sudo", "rm", "-f", cores[0]
                ])
            else:
                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

        else:
            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))