"""
Teuthology task for exercising CephFS client recovery
"""

import logging
from textwrap import dedent
import time
import distutils.version as version
import re
import os

from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.packaging import get_package_version
from unittest import SkipTest


log = logging.getLogger(__name__)


# Arbitrary timeouts for operations involving restarting
# an MDS or waiting for it to come up
MDS_RESTART_GRACE = 60


class TestClientNetworkRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True
    REQUIRE_ONE_CLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]
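    # Each name in LOAD_SETTINGS is read from the MDS configuration by the
    # test framework during setUp() and stored as an attribute of the same
    # name on the test case.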

    # Environment references
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_network_death(self):
        """
        Simulate software freeze or temporary network failure.

        Check that the client blocks I/O during failure, and completes
        I/O after failure.
        """

        session_timeout = self.fs.get_var("session_timeout")
        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])

        # We only need one client
        self.mount_b.umount_wait()

        # Initially our one client session should be visible
        client_id = self.mount_a.get_global_id()
        ls_data = self._session_list()
        self.assert_session_count(1, ls_data)
        self.assertEqual(ls_data[0]['id'], client_id)
        self.assert_session_state(client_id, "open")

        # ...and capable of doing I/O without blocking
        self.mount_a.create_files()

        # ...but if we turn off the network
        self.fs.set_clients_block(True)

        # ...and try and start an I/O
        write_blocked = self.mount_a.write_background()

        # ...then it should block
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assertFalse(write_blocked.finished)
        self.assert_session_state(client_id, "stale")

        # ...until we re-enable I/O
        self.fs.set_clients_block(False)

        # ...when it should complete promptly
        a = time.time()
        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
        recovery_time = time.time() - a
        log.info("recovery time: {0}".format(recovery_time))
        self.assert_session_state(client_id, "open")


class TestClientRecovery(CephFSTestCase):
    REQUIRE_KCLIENT_REMOTE = True
    CLIENTS_REQUIRED = 2

    LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"]

    # Environment references
    mds_reconnect_timeout = None
    ms_max_backoff = None

    def test_basic(self):
        # Check that two clients come up healthy and see each others' files
        # =====================================================
        self.mount_a.create_files()
        self.mount_a.check_files()
        self.mount_a.umount_wait()

        self.mount_b.check_files()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # Check that the admin socket interface is correctly reporting
        # two sessions
        # =====================================================
        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        self.assertSetEqual(
            set([l['id'] for l in ls_data]),
            {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
        )

    def test_restart(self):
        # Check that after an MDS restart both clients reconnect and continue
        # to handle I/O
        # =====================================================
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_a.create_destroy()
        self.mount_b.create_destroy()

    def _session_num_caps(self, client_id):
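        # Look up this client's session in the MDS's 'session ls' output and
        # return the number of capabilities it currently holds.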
        ls_data = self.fs.mds_asok(['session', 'ls'])
        return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps'])

    def test_reconnect_timeout(self):
        # Reconnect timeout
        # =================
        # Check that if I stop an MDS and a client goes away, the MDS waits
        # for the reconnect period
        self.fs.mds_stop()
        self.fs.mds_fail()

        mount_a_client_id = self.mount_a.get_global_id()
        self.mount_a.umount_wait(force=True)

        self.fs.mds_restart()

        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        # Check that the MDS locally reports its state correctly
        status = self.fs.mds_asok(['status'])
        self.assertIn("reconnect_status", status)

        ls_data = self._session_list()
        self.assert_session_count(2, ls_data)

        # The session for the dead client should have the 'reconnect' flag set
        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])

        # Wait for the reconnect state to clear; this should take the
        # reconnect timeout period.
        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
        # Check that the period we waited to enter active is within a factor
        # of two of the reconnect timeout.
        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
                           "Should have been in reconnect phase for {0} but only took {1}".format(
                               self.mds_reconnect_timeout, in_reconnect_for
                           ))

        self.assert_session_count(1)

        # Check that the client that timed out during reconnect can
        # mount again and do I/O
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

        self.assert_session_count(2)

    def test_reconnect_eviction(self):
        # Eviction during reconnect
        # =========================
        mount_a_client_id = self.mount_a.get_global_id()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # The mount goes away while the MDS is offline
        self.mount_a.kill()

        self.fs.mds_restart()

        # Enter reconnect phase
        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
        self.assert_session_count(2)

        # Evict the stuck client
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        self.assert_session_count(1)

        # Observe that we proceed to the active phase without waiting the full
        # reconnect timeout
        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
        # Once we evict the troublemaker, the reconnect phase should complete
        # in well under the reconnect timeout.
        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
                        "reconnect did not complete soon enough after eviction, took {0}".format(
                            evict_til_active
                        ))

        # We killed earlier so must clean up before trying to use again
        self.mount_a.kill_cleanup()

        # Bring the client back
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()
        self.mount_a.create_destroy()

    def _test_stale_caps(self, write):
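        # When 'write' is true the background holder keeps a write capability
        # on the file; when false it remounts first and opens the file
        # read-only, so it only holds read caps when it is killed.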
        session_timeout = self.fs.get_var("session_timeout")

        # Capability release from stale session
        # =====================================
        if write:
            cap_holder = self.mount_a.open_background()
        else:
            self.mount_a.run_shell(["touch", "background_file"])
            self.mount_a.umount_wait()
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()
            cap_holder = self.mount_a.open_background(write=False)

        self.assert_session_count(2)
        mount_a_gid = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        # wait for it to die so it doesn't voluntarily release buffer cap
        time.sleep(5)

        try:
            # Now, after session_timeout seconds, the waiter should
            # complete their operation when the MDS marks the holder's
            # session stale.
            cap_waiter = self.mount_b.write_background()
            a = time.time()
            cap_waiter.wait()
            b = time.time()

            # Should have succeeded
            self.assertEqual(cap_waiter.exitstatus, 0)

            if write:
                self.assert_session_count(1)
            else:
                self.assert_session_state(mount_a_gid, "stale")

            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0,
                            "Capability handover took {0}, expected approx {1}".format(
                                cap_waited, session_timeout
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_stale_read_caps(self):
        self._test_stale_caps(False)

    def test_stale_write_caps(self):
        self._test_stale_caps(True)

    def test_evicted_caps(self):
        # Eviction while holding a capability
        # ===================================

        session_timeout = self.fs.get_var("session_timeout")

        # Take out a write capability on a file on client A,
        # and then immediately kill it.
        cap_holder = self.mount_a.open_background()
        mount_a_client_id = self.mount_a.get_global_id()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.kill()

        # wait for it to die so it doesn't voluntarily release buffer cap
        time.sleep(5)

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()
            time.sleep(5)
            self.assertFalse(cap_waiter.finished)

            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
            # Now, because I evicted the old holder of the capability, it should
            # immediately get handed over to the waiter
            a = time.time()
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))
            # This is the check that it happened 'now' rather than waiting
            # for the session timeout
            self.assertLess(cap_waited, session_timeout / 2.0,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, session_timeout / 2.0
                            ))

            cap_holder.stdin.close()
            try:
                cap_holder.wait()
            except (CommandFailedError, ConnectionLostError):
                # We killed it (and possibly its node), so it raises an error
                pass
        finally:
            self.mount_a.kill_cleanup()

        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

    def test_trim_caps(self):
        # Trim capability when reconnecting MDS
        # ===================================

        count = 500
        # Create lots of files
        for i in range(count):
            self.mount_a.run_shell(["touch", "f{0}".format(i)])

        # Populate mount_b's cache
        self.mount_b.run_shell(["ls", "-l"])

        client_id = self.mount_b.get_global_id()
        num_caps = self._session_num_caps(client_id)
        self.assertGreaterEqual(num_caps, count)

        # Restart the MDS; the client should trim its cache when reconnecting
        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        num_caps = self._session_num_caps(client_id)
        self.assertLess(num_caps, count,
                        "should have less than {0} capabilities, have {1}".format(
                            count, num_caps
                        ))

    def _is_flockable(self):
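        """
        Return True if both clients' fuse packages are new enough (>= 2.9)
        to support flock locks; otherwise log why and return False.
        """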
        a_version_str = get_package_version(self.mount_a.client_remote, "fuse")
        b_version_str = get_package_version(self.mount_b.client_remote, "fuse")
        flock_version_str = "2.9"

        version_regex = re.compile(r"[0-9\.]+")
        a_result = version_regex.match(a_version_str)
        self.assertTrue(a_result)
        b_result = version_regex.match(b_version_str)
        self.assertTrue(b_result)
        a_version = version.StrictVersion(a_result.group())
        b_version = version.StrictVersion(b_result.group())
        flock_version = version.StrictVersion(flock_version_str)

        if a_version >= flock_version and b_version >= flock_version:
            log.info("flock locks are available")
            return True
        else:
            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
                av=a_version_str, bv=b_version_str))
            return False

    def test_filelock(self):
        """
        Check that file lock doesn't get lost after an MDS restart
        """

        flockable = self._is_flockable()
        lock_holder = self.mount_a.lock_background(do_flock=flockable)

        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock(do_flock=flockable)

        self.fs.mds_fail_restart()
        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)

        self.mount_b.check_filelock(do_flock=flockable)

        # Tear down the background process
        lock_holder.stdin.close()
        try:
            lock_holder.wait()
        except (CommandFailedError, ConnectionLostError):
            # We killed it, so it raises an error
            pass

    def test_filelock_eviction(self):
        """
        Check that a file lock held by an evicted client is given to
        a waiting client.
        """
        if not self._is_flockable():
            self.skipTest("flock is not available")

        lock_holder = self.mount_a.lock_background()
        self.mount_b.wait_for_visible("background_file-2")
        self.mount_b.check_filelock()

        lock_taker = self.mount_b.lock_and_release()
        # Check the taker is waiting (doesn't get it immediately)
        time.sleep(2)
        self.assertFalse(lock_holder.finished)
        self.assertFalse(lock_taker.finished)

        try:
            mount_a_client_id = self.mount_a.get_global_id()
            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

            # Evicting mount_a should let mount_b's attempt to take the lock
            # succeed
            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
        finally:
            # teardown() doesn't quite handle this case cleanly, so help it out
            self.mount_a.kill()
            self.mount_a.kill_cleanup()

            # Bring the client back
            self.mount_a.mount()
            self.mount_a.wait_until_mounted()

    def test_dir_fsync(self):
        self._test_fsync(True)

    def test_create_fsync(self):
        self._test_fsync(False)

    def _test_fsync(self, dirfsync):
        """
        That calls to fsync guarantee visibility of metadata to another
        client immediately after the fsyncing client dies.
        """

        # Leave this guy out until he's needed
        self.mount_b.umount_wait()

        # Create dir + child dentry on client A, and fsync the dir
        path = os.path.join(self.mount_a.mountpoint, "subdir")
        self.mount_a.run_python(
            dedent("""
                import os
                import time

                path = "{path}"

                print "Starting creation..."
                start = time.time()

                os.mkdir(path)
                dfd = os.open(path, os.O_DIRECTORY)

                fd = open(os.path.join(path, "childfile"), "w")
                print "Finished creation in {{0}}s".format(time.time() - start)

                print "Starting fsync..."
                start = time.time()
                if {dirfsync}:
                    os.fsync(dfd)
                else:
                    os.fsync(fd)
                print "Finished fsync in {{0}}s".format(time.time() - start)
            """.format(path=path, dirfsync=str(dirfsync)))
        )

        # Immediately kill the MDS and then client A
        self.fs.mds_stop()
        self.fs.mds_fail()
        self.mount_a.kill()
        self.mount_a.kill_cleanup()

        # Restart the MDS.  Wait for it to come up; it'll have to time out in clientreplay
        self.fs.mds_restart()
        log.info("Waiting for reconnect...")
        self.fs.wait_for_state("up:reconnect")
        log.info("Waiting for active...")
        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
        log.info("Reached active...")

        # Is the child dentry visible from mount B?
        self.mount_b.mount()
        self.mount_b.wait_until_mounted()
        self.mount_b.run_shell(["ls", "subdir/childfile"])

    def test_unmount_for_evicted_client(self):
        """Test that the client does not hang on unmount after it has been evicted."""
        mount_a_client_id = self.mount_a.get_global_id()
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])

        self.mount_a.umount_wait(require_clean=True, timeout=30)

    def test_stale_renew(self):
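        """
        Check that after a client's session has gone stale (its ceph-fuse
        process was stopped), the client drops its now-invalid caps on resume
        and sees changes made by other clients in the meantime.
        """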
        if not isinstance(self.mount_a, FuseMount):
            raise SkipTest("Require FUSE client to handle signal STOP/CONT")

        session_timeout = self.fs.get_var("session_timeout")

        self.mount_a.run_shell(["mkdir", "testdir"])
        self.mount_a.run_shell(["touch", "testdir/file1"])
        # populate readdir cache
        self.mount_a.run_shell(["ls", "testdir"])
        self.mount_b.run_shell(["ls", "testdir"])

        # check if readdir cache is effective
        initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.mount_b.run_shell(["ls", "testdir"])
        current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency'])
        self.assertEqual(current_readdirs, initial_readdirs)

        mount_b_gid = self.mount_b.get_global_id()
        mount_b_pid = self.mount_b.get_client_pid()
        # stop ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-STOP", mount_b_pid])

        self.assert_session_state(mount_b_gid, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale

        self.mount_a.run_shell(["touch", "testdir/file2"])
        self.assert_session_state(mount_b_gid, "stale")

        # resume ceph-fuse process of mount_b
        self.mount_b.client_remote.run(args=["sudo", "kill", "-CONT", mount_b_pid])
        # Is the new file visible from mount_b? (caps become invalid after session stale)
        self.mount_b.run_shell(["ls", "testdir/file2"])

    def test_abort_conn(self):
        """
        Check that abort_conn() skips closing MDS sessions.
        """
        if not isinstance(self.mount_a, FuseMount):
            raise SkipTest("Testing libcephfs function")

        self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false'])
        session_timeout = self.fs.get_var("session_timeout")

        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        gid_str = self.mount_a.run_python(dedent("""
            import cephfs as libcephfs
            cephfs = libcephfs.LibCephFS(conffile='')
            cephfs.mount()
            client_id = cephfs.get_instance_id()
            cephfs.abort_conn()
            print client_id
            """)
        )
        gid = int(gid_str)

        self.assert_session_state(gid, "open")
        time.sleep(session_timeout * 1.5)  # Long enough for MDS to consider session stale
        self.assert_session_state(gid, "stale")

    def test_dont_mark_unresponsive_client_stale(self):
        """
        Test that an unresponsive client holding caps is not marked stale or
        evicted unless another client wants its caps.
        """
        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client to handle signal STOP/CONT")

        # XXX: To conduct this test we need at least two clients since a
        # single client is never evicted by the MDS.
        SESSION_TIMEOUT = 30
        SESSION_AUTOCLOSE = 50
        time_at_beg = time.time()
        mount_a_gid = self.mount_a.get_global_id()
        mount_a_pid = self.mount_a.client_pid
        self.fs.set_var('session_timeout', SESSION_TIMEOUT)
        self.fs.set_var('session_autoclose', SESSION_AUTOCLOSE)
        self.assert_session_count(2, self.fs.mds_asok(['session', 'ls']))

        # test that a client holding caps not required by any other client is
        # not marked stale when it becomes unresponsive.
        self.mount_a.run_shell(['mkdir', 'dir'])
        self.mount_a.send_signal('sigstop')
        time.sleep(SESSION_TIMEOUT + 2)
        self.assert_session_state(mount_a_gid, "open")

        # test that other clients have to wait to get the caps from the
        # unresponsive client until session_autoclose.
        self.mount_b.run_shell(['stat', 'dir'])
        self.assert_session_count(1, self.fs.mds_asok(['session', 'ls']))
        self.assertLess(time.time(), time_at_beg + SESSION_AUTOCLOSE)

        self.mount_a.send_signal('sigcont')