"""
Teuthology task for exercising CephFS client recovery
"""

import logging
from textwrap import dedent
import time
import distutils.version as version
import re
import os

from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.packaging import get_package_version
from unittest import SkipTest


log = logging.getLogger(__name__)


# Arbitrary timeouts for operations involving restarting
# an MDS or waiting for it to come up
MDS_RESTART_GRACE = 60


class TestClientNetworkRecovery(CephFSTestCase):
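    """
    Check client and MDS behaviour across a temporary network outage
    between them: I/O should block while the client's session goes
    stale, then complete promptly once traffic is allowed again.
    """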
    REQUIRE_KCLIENT_REMOTE = True
    REQUIRE_ONE_CLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_session_timeout = None
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_network_death(self):
        """
        Simulate software freeze or temporary network failure.

        Check that the client blocks I/O during failure, and completes
        I/O after failure.
        """

        # We only need one client
        self.mount_b.umount_wait()

        # Initially our one client session should be visible
        client_id = self.mount_a.get_global_id()
        ls_data = self._session_list()
        self.assert_session_count(1, ls_data)
        self.assertEqual(ls_data[0]['id'], client_id)
        self.assert_session_state(client_id, "open")

        # ...and capable of doing I/O without blocking
        self.mount_a.create_files()

        # ...but if we turn off the network
        self.fs.set_clients_block(True)

        # ...and try to start an I/O
        write_blocked = self.mount_a.write_background()

        # ...then it should block
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "open")
        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for the MDS to consider the session stale
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "stale")

        # ...until we re-enable I/O
        self.fs.set_clients_block(False)

        # ...when it should complete promptly
        a = time.time()
        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
        recovery_time = time.time() - a
        log.info("recovery time: {0}".format(recovery_time))
        self.assert_session_state(client_id, "open")


class TestClientRecovery(CephFSTestCase):
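    """
    Exercise CephFS client recovery paths: MDS restart and reconnect,
    session eviction, capability handover and trimming, file locking
    across restarts, fsync durability, and stale session renewal.
    """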
    REQUIRE_KCLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_session_timeout = None
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_basic(self):
        # Check that two clients come up healthy and see each other's files
        # =====================================================
        self.mount_a.create_files()
        self.mount_a.check_files()
        self.mount_a.umount_wait()

        self.mount_b.check_files()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # Check that the admin socket interface is correctly reporting
        # two sessions
        # =====================================================
        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        self.assertSetEqual(
            set([l['id'] for l in ls_data]),
            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
        )

    def test_restart(self):
        # Check that after an MDS restart both clients reconnect and continue
        # to handle I/O
        # =====================================================
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_a.create_destroy()
        self.mount_b.create_destroy()

    def _session_num_caps(self, client_id):
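        """
        Return the number of capabilities held by the given client's
        session, as reported by the MDS admin socket 'session ls' output.
        """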
        ls_data = self.fs.mds_asok(['session', 'ls'])
        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])

    def test_reconnect_timeout(self):
        # Reconnect timeout
        # =================
        # Check that if I stop an MDS and a client goes away, the MDS waits
        # for the reconnect period
        self.fs.mds_stop()
        self.fs.mds_fail()

        mount_a_client_id = self.mount_a.get_global_id()
        self.mount_a.umount_wait(force=True)

        self.fs.mds_restart()

        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        # Check that the MDS locally reports its state correctly
        status = self.fs.mds_asok(['status'])
        self.assertIn("reconnect_status", status)

        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        # The session for the dead client should have the 'reconnect' flag set
        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])

        # Wait for the reconnect state to clear; this should take the
        # reconnect timeout period.
        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
        # Check that the period we waited to enter active is within a factor
        # of two of the reconnect timeout.
        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
                           "Should have been in reconnect phase for {0} but only took {1}".format(
                               self.mds_reconnect_timeout, in_reconnect_for
                           ))

        self.assert_session_count(1)

        # Check that the client that timed out during reconnect can
        # mount again and do I/O
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

        self.assert_session_count(2)

    def test_reconnect_eviction(self):
        # Eviction during reconnect
        # =========================
        mount_a_client_id = self.mount_a.get_global_id()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # The mount goes away while the MDS is offline
        self.mount_a.kill()

        self.fs.mds_restart()

        # Enter reconnect phase
        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        self.assert_session_count(2)

        # Evict the stuck client
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        self.assert_session_count(1)

        # Observe that we proceed to the active phase without waiting for the full reconnect timeout
        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
        # Once we evict the troublemaker, the reconnect phase should complete
        # in well under the reconnect timeout.
        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
                        "reconnect did not complete soon enough after eviction, took {0}".format(
                            evict_til_active
                        ))

        # We killed the mount earlier, so we must clean it up before using it again
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

    def test_stale_caps(self):
        # Capability release from stale session
        # =====================================
        cap_holder = self.mount_a.open_background()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        try:
            # Now, after mds_session_timeout seconds, the waiter should
            # complete its operation when the MDS marks the holder's
            # session stale.
            cap_waiter = self.mount_b.write_background()
            a = time.time()
            cap_waiter.wait()
            b = time.time()

            # Should have succeeded
            self.assertEqual(cap_waiter.exitstatus, 0)

            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
                            "Capability handover took {0}, expected approx {1}".format(
                                cap_waited, self.mds_session_timeout
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_evicted_caps(self):
        # Eviction while holding a capability
        # ===================================

        # Take out a write capability on a file on client A,
        # and then immediately kill it.
        cap_holder = self.mount_a.open_background()
        mount_a_client_id = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()
            time.sleep(5)
            self.assertFalse(cap_waiter.finished)

            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
            # Now, because I evicted the old holder of the capability, it should
            # immediately get handed over to the waiter
            a = time.time()
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            # This is the check that it happened 'now' rather than waiting
            # for the session timeout
            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, self.mds_session_timeout / 2.0
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_trim_caps(self):
        # Trim capabilities when reconnecting to the MDS
        # ==============================================

        count = 500
        # Create lots of files
        for i in range(count):
            self.mount_a.run_shell(["touch", "f{0}".format(i)])

        # Populate mount_b's cache
        self.mount_b.run_shell(["ls", "-l"])

        client_id = self.mount_b.get_global_id()
        num_caps = self._session_num_caps(client_id)
        self.assertGreaterEqual(num_caps, count)

        # Restart the MDS. The client should trim its cache when reconnecting to the MDS
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        num_caps = self._session_num_caps(client_id)
        self.assertLess(num_caps, count,
                        "should have less than {0} capabilities, have {1}".format(
                            count, num_caps
                        ))

    def _is_flockable(self):
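        """
        Return True only if the fuse package on both client remotes is
        recent enough (>= 2.9) to support flock locks; otherwise log why
        and return False.
        """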
        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
        flock_version_str = "2.9"

        version_regex = re.compile(r"[0-9\.]+")
        a_result = version_regex.match(a_version_str)
        self.assertTrue(a_result)
        b_result = version_regex.match(b_version_str)
        self.assertTrue(b_result)
        a_version = version.StrictVersion(a_result.group())
        b_version = version.StrictVersion(b_result.group())
        flock_version = version.StrictVersion(flock_version_str)

        if a_version >= flock_version and b_version >= flock_version:
            log.info("flock locks are available")
            return True
        else:
            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
                av=a_version_str, bv=b_version_str))
            return False

    def test_filelock(self):
        """
        Check that a file lock doesn't get lost after an MDS restart
        """

        flockable = self._is_flockable()
        lock_holder = self.mount_a.lock_background(do_flock=flockable)

        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock(do_flock=flockable)

        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_b.check_filelock(do_flock=flockable)

        # Tear down the background process
        lock_holder.stdin.close()
        try:
            lock_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it, so it raises an error
            pass

    def test_filelock_eviction(self):
        """
        Check that a file lock held by an evicted client is handed to a
        waiting client.
        """
        if not self._is_flockable():
            self.skipTest("flock is not available")

        lock_holder = self.mount_a.lock_background()
        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock()

        lock_taker = self.mount_b.lock_and_release()
        # Check the taker is waiting (doesn't get it immediately)
        time.sleep(2)
        self.assertFalse(lock_holder.finished)
        self.assertFalse(lock_taker.finished)

        try:
            mount_a_client_id = self.mount_a.get_global_id()
            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

            # Evicting mount_a should let mount_b's attempt to take the lock
            # succeed
            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.kill()
            self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_dir_fsync(self):
        self._test_fsync(True)

    def test_create_fsync(self):
        self._test_fsync(False)

    def _test_fsync(self, dirfsync):
        """
        That calls to fsync guarantee visibility of metadata to another
        client immediately after the fsyncing client dies.
        """

        # Leave this guy out until he's needed
        self.mount_b.umount_wait()

        # Create dir + child dentry on client A, and fsync the dir
        path = os.path.join(self.mount_a.mountpoint, "subdir")
        self.mount_a.run_python(
            dedent("""
                import os
                import time

                path = "{path}"

                print("Starting creation...")
                start = time.time()

                os.mkdir(path)
                dfd = os.open(path, os.O_DIRECTORY)

                fd = open(os.path.join(path, "childfile"), "w")
                print("Finished creation in {{0}}s".format(time.time() - start))

                print("Starting fsync...")
                start = time.time()
                if {dirfsync}:
                    os.fsync(dfd)
                else:
                    os.fsync(fd)
                print("Finished fsync in {{0}}s".format(time.time() - start))
            """.format(path=path, dirfsync=str(dirfsync)))
        )

        # Immediately kill the MDS and then client A
        self.fs.mds_stop()
        self.fs.mds_fail()
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Restart the MDS. Wait for it to come up; it'll have to time out in clientreplay
        self.fs.mds_restart()
        log.info("Waiting for reconnect...")
        self.fs.wait_for_state("up:reconnect")
        log.info("Waiting for active...")
        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
        log.info("Reached active...")

        # Is the child dentry visible from mount B?
        self.mount_b.mount()
        self.mount_b.wait_until_mounted()
        self.mount_b.run_shell(["ls", "subdir/childfile"])

    def test_stale_renew(self):
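        """
        Check that a client whose session has gone stale (simulated here by
        SIGSTOPping the ceph-fuse process past the session timeout) does not
        trust its stale caps after resuming: a file created by the other
        client in the meantime must be visible to it.
        """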
        if not isinstance(self.mount_a, FuseMount):
            raise SkipTest("Require FUSE client to handle signal STOP/CONT")

        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["touch", "testdir/file1"])
        # populate readdir cache
        self.mount_a.run_shell(["ls", "testdir"])
        self.mount_b.run_shell(["ls", "testdir"])

        # check if readdir cache is effective
        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.mount_b.run_shell(["ls", "testdir"])
        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.assertEqual(current_readdirs, initial_readdirs)

        mount_b_gid = self.mount_b.get_global_id()
        mount_b_pid = self.mount_b.get_client_pid()
        # stop the ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])

        self.assert_session_state(mount_b_gid, "open")
        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for the MDS to consider the session stale
        self.assert_session_state(mount_b_gid, "stale")

        self.mount_a.run_shell(["touch", "testdir/file2"])

        # resume the ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
        # Is the new file visible from mount_b? (caps become invalid after the session goes stale)
        self.mount_b.run_shell(["ls", "testdir/file2"])

    def test_unmount_for_evicted_client(self):
        """Check that the client does not hang on unmount after it has been evicted."""
        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        self.mount_a.umount_wait(require_clean=True, timeout=30)