from io import StringIO

from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
from textwrap import dedent
from threading import Thread
import errno
import platform
import time
import json
import logging
import os
import re

log = logging.getLogger(__name__)


class TestMisc(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    def test_statfs_on_deleted_fs(self):
        """
        That statfs does not cause monitors to SIGSEGV after fs deletion.
        """

        self.mount_b.umount_wait()
        self.mount_a.run_shell_payload("stat -f .")
        self.fs.delete_all_filesystems()
        # This will hang either way, run in background.
        p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False)
        time.sleep(30)
        self.assertFalse(p.finished)
        # the process is stuck in uninterruptible sleep, just kill the mount
        self.mount_a.umount_wait(force=True)
        p.wait()

    def test_fuse_mount_on_already_mounted_path(self):
        if platform.system() != "Linux":
            self.skipTest("Require Linux platform")

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Try to mount an already mounted path, expecting an EBUSY error
        try:
            mount_cmd = ['sudo'] + self.mount_a._mount_bin + [self.mount_a.hostfs_mntpt]
            self.mount_a.client_remote.run(args=mount_cmd, stderr=StringIO(),
                                           stdout=StringIO(), timeout=60, omit_sudo=False)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EBUSY)
        else:
            self.fail("Expected EBUSY")

    def test_getattr_caps(self):
        """
        Check that the MDS recognizes the 'mask' parameter of an open request.
        The parameter allows the client to request caps when opening a file.
        """

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Enable debug. The client will request CEPH_CAP_XATTR_SHARED
        # on lookup/open.
        self.mount_b.umount_wait()
        self.set_conf('client', 'client debug getattr caps', 'true')
        self.mount_b.mount_wait()

        # Create a file and hold it open. The MDS will issue CEPH_CAP_EXCL_*
        # to mount_a.
        p = self.mount_a.open_background("testfile")
        self.mount_b.wait_for_visible("testfile")

        # This triggers a lookup request and an open request. The debug
        # code will check whether the lookup/open reply contains xattrs.
        self.mount_b.run_shell(["cat", "testfile"])

        self.mount_a.kill_background(p)

    def test_root_rctime(self):
        """
        Check that the root inode has a non-default rctime on startup.
        """

        t = time.time()
        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
        log.info("rctime = {}".format(rctime))
        self.assertGreaterEqual(float(rctime), t - 10)

    def test_fs_new(self):
        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        data_pool_name = self.fs.get_data_pool_name()

        self.fs.fail()

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))

        # insert a garbage object
        self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar"))

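        # Helper: report True once the freshly created metadata pool shows at
        # least one object; swallow RuntimeError while pool stats are not yet
        # available.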
        def get_pool_df(fs, name):
            try:
                return fs.get_pool_df(name)['objects'] > 0
            except RuntimeError:
                return False

        self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)

        try:
            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                                self.fs.metadata_pool_name,
                                                data_pool_name)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EINVAL)
        else:
            raise AssertionError("Expected EINVAL")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name, "--force")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))
        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name,
                                            '--allow_dangerous_metadata_overlay')

    def test_cap_revoke_nonresponder(self):
        """
        Check that a client is evicted if it has not responded to a cap revoke
        request for the configured number of seconds.
        """
        session_timeout = self.fs.get_var("session_timeout")
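        # Use an eviction timeout shorter than the session timeout so the
        # cap-revoke auto-evicter fires before the session would go stale.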
        eviction_timeout = session_timeout / 2.0

        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
                          str(eviction_timeout)])

        cap_holder = self.mount_a.open_background()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.suspend_netns()

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()

            a = time.time()
            time.sleep(eviction_timeout)
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))

            # Check that the cap is transferred before the session timeout kicks
            # in. This is a good enough check to ensure that the client got
            # evicted by the cap auto-evicter rather than transitioning to the
            # stale state and then getting evicted.
            self.assertLess(cap_waited, session_timeout,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, session_timeout
                            ))

            self.assertTrue(self.mds_cluster.is_addr_blocklisted(
                self.mount_a.get_global_addr()))
            self.mount_a._kill_background(cap_holder)
        finally:
            self.mount_a.resume_netns()

    def test_filtered_df(self):
        pool_name = self.fs.get_data_pool_name()
        raw_df = self.fs.get_pool_df(pool_name)
        raw_avail = float(raw_df["max_avail"])
        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
                                                  pool_name, 'size',
                                                  '-f', 'json-pretty')
        _ = json.loads(out)

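        # 'df .' on the client mount reports sizes in 1K blocks; convert the
        # available column to bytes and check that it tracks the data pool's
        # max_avail to within 10%.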
        proc = self.mount_a.run_shell(['df', '.'])
        output = proc.stdout.getvalue()
        fs_avail = output.split('\n')[1].split()[3]
        fs_avail = float(fs_avail) * 1024

        ratio = raw_avail / fs_avail
        assert 0.9 < ratio < 1.1

    def test_dump_inode(self):
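        # Inode number 1 is the CephFS root inode, so its path should be "/".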
        info = self.fs.mds_asok(['dump', 'inode', '1'])
        assert info['path'] == "/"

    def test_dump_inode_hexademical(self):
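        # 'dump inode' should also accept the inode number in hexadecimal form.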
        self.mount_a.run_shell(["mkdir", "-p", "foo"])
        ino = self.mount_a.path_to_ino("foo")
        assert type(ino) is int
        info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
        assert info['path'] == "/foo"

    def test_fs_lsflags(self):
        """
        Check that 'fs lsflags' displays the default state and the newly set
        state of the flags.
        """
        # Set some flags
        self.fs.set_joinable(False)
        self.fs.set_allow_new_snaps(False)
        self.fs.set_allow_standby_replay(True)

        lsflags = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'lsflags',
                                                                 self.fs.name,
                                                                 "--format=json-pretty"))
        self.assertEqual(lsflags["joinable"], False)
        self.assertEqual(lsflags["allow_snaps"], False)
        self.assertEqual(lsflags["allow_multimds_snaps"], True)
        self.assertEqual(lsflags["allow_standby_replay"], True)

    def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False):
        self.mount_a.run_shell(["mkdir", dir_path])

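        # Script run on the client to fsync the directory inode itself; used
        # when file_sync=True, otherwise a plain sync(1) is issued instead.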
        sync_dir_pyscript = dedent("""
                import os

                path = "{path}"
                dfd = os.open(path, os.O_DIRECTORY)
                os.fsync(dfd)
                os.close(dfd)
                """.format(path=dir_path))

        # run create/delete directories and test the sync time duration
        for i in range(300):
            for j in range(5):
                self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync mkdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

            for j in range(5):
                self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync rmdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

        self.mount_a.run_shell(["rm", "-rf", dir_path])

    def test_filesystem_sync_stuck_for_around_5s(self):
        """
        Check that a filesystem sync does not get stuck waiting for the mdlog
        to be flushed for around 5 seconds.
        """

        dir_path = "filesystem_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path)

    def test_file_sync_stuck_for_around_5s(self):
        """
        Check that an fsync on a directory does not get stuck waiting for the
        mdlog to be flushed for around 5 seconds.
        """

        dir_path = "file_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path, True)

    def test_file_filesystem_sync_crash(self):
        """
        Check that the kernel does not crash when file/filesystem syncs run
        concurrently with directory creation and removal.
        """

        stop_thread = False
        dir_path = "file_filesystem_sync_crash_testdir"
        self.mount_a.run_shell(["mkdir", dir_path])

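        # The worker threads below only read stop_thread; they pick up the
        # later assignment from the enclosing test method through the closure.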
        def mkdir_rmdir_thread(mount, path):
            #global stop_thread

            log.info(" mkdir_rmdir_thread starting...")
            num = 0
            while not stop_thread:
                n = num
                m = num
                for __ in range(10):
                    mount.run_shell(["mkdir", os.path.join(path, f"{n}")])
                    n += 1
                for __ in range(10):
                    mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")])
                    m += 1
                num += 10
            log.info(" mkdir_rmdir_thread stopped")

        def filesystem_sync_thread(mount, path):
            #global stop_thread

            log.info(" filesystem_sync_thread starting...")
            while not stop_thread:
                mount.run_shell(["sync"])
            log.info(" filesystem_sync_thread stopped")

        def file_sync_thread(mount, path):
            #global stop_thread

            log.info(" file_sync_thread starting...")
            pyscript = dedent("""
                import os

                path = "{path}"
                dfd = os.open(path, os.O_DIRECTORY)
                os.fsync(dfd)
                os.close(dfd)
                """.format(path=path))

            while not stop_thread:
                mount.run_shell(['python3', '-c', pyscript])
            log.info(" file_sync_thread stopped")

        td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,))
        td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,))
        td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,))

        td1.start()
        td2.start()
        td3.start()
        time.sleep(1200)  # run 20 minutes
        stop_thread = True
        td1.join()
        td2.join()
        td3.join()
        self.mount_a.run_shell(["rm", "-rf", dir_path])

    def test_dump_inmemory_log_on_client_eviction(self):
        """
        That the in-memory logs are dumped during a client eviction event.
        """
        self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10'])
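        # Dump the recent in-memory log entries shortly (within ~1s) after an
        # extraordinary event such as the client eviction triggered below.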
        self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1'])
        mount_a_client_id = self.mount_a.get_global_id()
        infos = self.fs.status().get_ranks(self.fs.id)

        # evict the client
        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
        time.sleep(10)  # wait for 10 seconds for the log dumping to complete

        # The client is evicted, so unmount it.
        try:
            self.mount_a.umount_wait(require_clean=True, timeout=30)
        except:
            pass  # continue with grepping the log

        eviction_log = rf"Evicting (\(and blocklisting\) )?client session {mount_a_client_id} \(.+:.+/.+\)"
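        # sed prints only the MDS log section between the "begin/end dump of
        # recent events" markers.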
        search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p"
        for info in infos:
            mds_id = info['name']
            try:
                remote = self.fs.mon_manager.find_remote('mds', mds_id)
                out = remote.run(args=["sed",
                                       "-n",
                                       "{0}".format(search_range),
                                       f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"],
                                 stdout=StringIO(), timeout=30)
            except:
                continue  # continue with the next info
            if out.stdout and re.search(eviction_log, out.stdout.getvalue().strip()):
                return
        self.assertTrue(False, "Failed to dump in-memory logs during client eviction")

    def test_dump_inmemory_log_on_missed_beacon_ack_from_monitors(self):
        """
        That the in-memory logs are dumped when the mds misses beacon ACKs from monitors.
        """
        self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10'])
        self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1'])
        try:
            mons = json.loads(self.fs.mon_manager.raw_cluster_cmd('mon', 'dump', '-f', 'json'))['mons']
        except:
            self.assertTrue(False, "Error fetching monitors")

        # Freeze all monitors
        for mon in mons:
            mon_name = mon['name']
            log.info(f'Sending STOP to mon {mon_name}')
            self.fs.mon_manager.signal_mon(mon_name, 19)

        time.sleep(10)  # wait for 10 seconds to get the in-memory logs dumped

        # Unfreeze all monitors
        for mon in mons:
            mon_name = mon['name']
            log.info(f'Sending CONT to mon {mon_name}')
            self.fs.mon_manager.signal_mon(mon_name, 18)

        missed_beacon_ack_log = "missed beacon ack from the monitors"
        search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p"
        for info in self.fs.status().get_ranks(self.fs.id):
            mds_id = info['name']
            try:
                remote = self.fs.mon_manager.find_remote('mds', mds_id)
                out = remote.run(args=["sed",
                                       "-n",
                                       "{0}".format(search_range),
                                       f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"],
                                 stdout=StringIO(), timeout=30)
            except:
                continue  # continue with the next info
            if out.stdout and (missed_beacon_ack_log in out.stdout.getvalue().strip()):
                return
        self.assertTrue(False, "Failed to dump in-memory logs during missed beacon ack")

    def test_dump_inmemory_log_on_missed_internal_heartbeats(self):
        """
        That the in-memory logs are dumped when the mds misses internal heartbeats.
        """
        self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10'])
        self.fs.mds_asok(['config', 'set', 'mds_heartbeat_grace', '1'])
        self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1'])
        try:
            mons = json.loads(self.fs.mon_manager.raw_cluster_cmd('mon', 'dump', '-f', 'json'))['mons']
        except:
            self.assertTrue(False, "Error fetching monitors")

        # Freeze all monitors
        for mon in mons:
            mon_name = mon['name']
            log.info(f'Sending STOP to mon {mon_name}')
            self.fs.mon_manager.signal_mon(mon_name, 19)

        time.sleep(10)  # wait for 10 seconds to get the in-memory logs dumped

        # Unfreeze all monitors
        for mon in mons:
            mon_name = mon['name']
            log.info(f'Sending CONT to mon {mon_name}')
            self.fs.mon_manager.signal_mon(mon_name, 18)

        missed_internal_heartbeat_log = \
            r"Skipping beacon heartbeat to monitors \(last acked .+s ago\); MDS internal heartbeat is not healthy!"
        search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p"
        for info in self.fs.status().get_ranks(self.fs.id):
            mds_id = info['name']
            try:
                remote = self.fs.mon_manager.find_remote('mds', mds_id)
                out = remote.run(args=["sed",
                                       "-n",
                                       "{0}".format(search_range),
                                       f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"],
                                 stdout=StringIO(), timeout=30)
            except:
                continue  # continue with the next info
            if out.stdout and re.search(missed_internal_heartbeat_log, out.stdout.getvalue().strip()):
                return
        self.assertTrue(False, "Failed to dump in-memory logs during missed internal heartbeat")

    def _session_client_ls(self, cmd):
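        # Shared checks for "session ls" and "client ls": every listed session
        # reports a known mount point, filtering by id returns only that
        # client, and --cap_dump includes the per-session caps.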
        mount_a_client_id = self.mount_a.get_global_id()
        info = self.fs.rank_asok(cmd)
        mount_a_mountpoint = self.mount_a.mountpoint
        mount_b_mountpoint = self.mount_b.mountpoint
        self.assertIsNotNone(info)
        for i in range(0, len(info)):
            self.assertIn(info[i]["client_metadata"]["mount_point"],
                          [mount_a_mountpoint, mount_b_mountpoint])
        info = self.fs.rank_asok(cmd + [f"id={mount_a_client_id}"])
        self.assertEqual(len(info), 1)
        self.assertEqual(info[0]["id"], mount_a_client_id)
        self.assertEqual(info[0]["client_metadata"]["mount_point"], mount_a_mountpoint)
        info = self.fs.rank_asok(cmd + ['--cap_dump'])
        for i in range(0, len(info)):
            self.assertIn("caps", info[i])

    def test_session_ls(self):
        self._session_client_ls(['session', 'ls'])

    def test_client_ls(self):
        self._session_client_ls(['client', 'ls'])


class TestCacheDrop(CephFSTestCase):
    CLIENTS_REQUIRED = 1

    def _run_drop_cache_cmd(self, timeout=None):
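        # Issue "cache drop [timeout]" to the MDS over the tell interface and
        # return the parsed result.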
        result = None
        args = ["cache", "drop"]
        if timeout is not None:
            args.append(str(timeout))
        result = self.fs.rank_tell(args)
        return result

    def _setup(self, max_caps=20, threshold=400):
        # create some files
        self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)

        # Reduce this so the MDS doesn't recall the maximum for simple tests
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])

    def test_drop_cache_command(self):
        """
        Basic test for checking drop cache command.
        Confirm it halts without a timeout.
        Note that the cache size post trimming is not checked here.
        """
        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
        self._setup()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        # It should take at least 1 second
        self.assertGreater(result['duration'], 1)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)

    def test_drop_cache_command_timeout(self):
        """
        Basic test for checking drop cache command.
        Confirm recall halts early via a timeout.
        Note that the cache size post trimming is not checked here.
        """
        self._setup()
        result = self._run_drop_cache_cmd(timeout=10)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 10)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 100)  # we did something, right?

    def test_drop_cache_command_dead_timeout(self):
        """
        Check drop cache command with non-responding client using tell
        interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        # Note: recall is subject to the timeout. The journal flush will
        # be delayed due to the client being dead.
        result = self._run_drop_cache_cmd(timeout=5)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is journal flush, which causes the
        # MDS to wait for cap revoke.
        #self.assertEqual(0, result['trim_cache']['trimmed'])
        self.mount_a.resume_netns()

    def test_drop_cache_command_dead(self):
        """
        Check drop cache command with non-responding client using tell
        interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is journal flush, which causes the
        # MDS to wait for cap revoke.
        self.mount_a.resume_netns()

class TestSkipReplayInoTable(CephFSTestCase):
    MDSS_REQUIRED = 1
    CLIENTS_REQUIRED = 1

    def test_alloc_cinode_assert(self):
        """
        Test alloc CInode assert.

        See: https://tracker.ceph.com/issues/52280
        """

        # Create a directory; the mds will journal this and then crash
        self.mount_a.run_shell(["rm", "-rf", "test_alloc_ino"])
        self.mount_a.run_shell(["mkdir", "test_alloc_ino"])

        status = self.fs.status()
        rank0 = self.fs.get_rank(rank=0, status=status)

        self.fs.mds_asok(['config', 'set', 'mds_kill_skip_replaying_inotable', "true"])
        # This will make the MDS crash; since we only have one MDS in the
        # cluster, without the "wait=False" it would get stuck here forever.
        self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir1"], wait=False)

        # sleep 10 seconds to make sure the journal logs are flushed and
        # the mds crashes
        time.sleep(10)

        # Now set the mds config to skip replaying the inotable
        self.fs.set_ceph_conf('mds', 'mds_inject_skip_replaying_inotable', True)
        self.fs.set_ceph_conf('mds', 'mds_wipe_sessions', True)

        self.fs.mds_restart()
        # sleep 5 seconds to make sure the mds tell command won't get stuck
        time.sleep(5)
        self.fs.wait_for_daemons()

        self.delete_mds_coredump(rank0['name'])

        self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir2"])

        ls_out = set(self.mount_a.ls("test_alloc_ino/"))
        self.assertEqual(ls_out, set({"dir1", "dir2"}))