from io import StringIO

from tasks.cephfs.fuse_mount import FuseMount
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from teuthology.exceptions import CommandFailedError
from textwrap import dedent
from threading import Thread
import errno
import platform
import time
import json
import logging
import os

log = logging.getLogger(__name__)

class TestMisc(CephFSTestCase):
    CLIENTS_REQUIRED = 2

    def test_statfs_on_deleted_fs(self):
        """
        That statfs does not cause monitors to SIGSEGV after fs deletion.
        """

        self.mount_b.umount_wait()
        self.mount_a.run_shell_payload("stat -f .")
        self.fs.delete_all_filesystems()
        # This will hang either way, run in background.
        p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False)
        time.sleep(30)
        self.assertFalse(p.finished)
        # the process is stuck in uninterruptible sleep, just kill the mount
        self.mount_a.umount_wait(force=True)
        p.wait()

    def test_fuse_mount_on_already_mounted_path(self):
        if platform.system() != "Linux":
            self.skipTest("Require Linux platform")

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Try to mount an already mounted path; expect EBUSY.
        try:
            mount_cmd = ['sudo'] + self.mount_a._mount_bin + [self.mount_a.hostfs_mntpt]
            self.mount_a.client_remote.run(args=mount_cmd, stderr=StringIO(),
                                           stdout=StringIO(), timeout=60, omit_sudo=False)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EBUSY)
        else:
            self.fail("Expected EBUSY")

    def test_getattr_caps(self):
        """
        Check that the MDS recognizes the 'mask' parameter of the open request.
        The parameter allows the client to request caps when opening a file.
        """

        if not isinstance(self.mount_a, FuseMount):
            self.skipTest("Require FUSE client")

        # Enable debug. The client will request CEPH_CAP_XATTR_SHARED
        # on lookup/open.
        self.mount_b.umount_wait()
        self.set_conf('client', 'client debug getattr caps', 'true')
        self.mount_b.mount_wait()

        # Create a file and hold it open. The MDS will issue CEPH_CAP_EXCL_*
        # to mount_a.
        p = self.mount_a.open_background("testfile")
        self.mount_b.wait_for_visible("testfile")

        # This triggers a lookup request and an open request. The debug
        # code will check whether the lookup/open reply contains xattrs.
        self.mount_b.run_shell(["cat", "testfile"])

        self.mount_a.kill_background(p)

    def test_root_rctime(self):
        """
        Check that the root inode has a non-default rctime on startup.
        """

        t = time.time()
        rctime = self.mount_a.getfattr(".", "ceph.dir.rctime")
        log.info("rctime = {}".format(rctime))
        self.assertGreaterEqual(float(rctime), t - 10)

    def test_fs_new(self):
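        """
        That 'fs new' returns EINVAL when the metadata pool already contains
        objects, succeeds when '--force' is passed, and succeeds without
        '--force' once the metadata pool has been recreated empty.
        """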
        self.mount_a.umount_wait()
        self.mount_b.umount_wait()

        data_pool_name = self.fs.get_data_pool_name()

        self.fs.fail()

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))

        # insert a garbage object
        self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar"))

        def get_pool_df(fs, name):
            try:
                return fs.get_pool_df(name)['objects'] > 0
            except RuntimeError:
                return False

        self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30)

        try:
            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                                self.fs.metadata_pool_name,
                                                data_pool_name)
        except CommandFailedError as e:
            self.assertEqual(e.exitstatus, errno.EINVAL)
        else:
            raise AssertionError("Expected EINVAL")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name, "--force")

        self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name)

        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
                                            '--yes-i-really-mean-it')

        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
                                            self.fs.metadata_pool_name,
                                            self.fs.metadata_pool_name,
                                            '--yes-i-really-really-mean-it')
        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
                                            self.fs.metadata_pool_name,
                                            '--pg_num_min', str(self.fs.pg_num_min))
        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
                                            self.fs.metadata_pool_name,
                                            data_pool_name)

    def test_cap_revoke_nonresponder(self):
        """
        Check that a client is evicted if it has not responded to a cap
        revoke request within the configured number of seconds.
        """
        session_timeout = self.fs.get_var("session_timeout")
        eviction_timeout = session_timeout / 2.0

        self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout',
                          str(eviction_timeout)])

        cap_holder = self.mount_a.open_background()

        # Wait for the file to be visible from another client, indicating
        # that mount_a has completed its network ops
        self.mount_b.wait_for_visible()

        # Simulate client death
        self.mount_a.suspend_netns()

        try:
            # The waiter should get stuck waiting for the capability
            # held on the MDS by the now-dead client A
            cap_waiter = self.mount_b.write_background()

            a = time.time()
            time.sleep(eviction_timeout)
            cap_waiter.wait()
            b = time.time()
            cap_waited = b - a
            log.info("cap_waiter waited {0}s".format(cap_waited))

            # Check that the cap is transferred before the session timeout
            # kicks in. This is a good enough check to ensure that the client
            # got evicted by the cap auto-evicter rather than transitioning to
            # the stale state and then getting evicted.
            self.assertLess(cap_waited, session_timeout,
                            "Capability handover took {0}, expected less than {1}".format(
                                cap_waited, session_timeout
                            ))

            self.assertTrue(self.mds_cluster.is_addr_blocklisted(
                self.mount_a.get_global_addr()))
            self.mount_a._kill_background(cap_holder)
        finally:
            self.mount_a.resume_netns()

    def test_filtered_df(self):
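        """
        That the available space reported by df on a mount roughly matches
        the data pool's max_avail (to within 10%).
        """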
        pool_name = self.fs.get_data_pool_name()
        raw_df = self.fs.get_pool_df(pool_name)
        raw_avail = float(raw_df["max_avail"])
        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
                                                  pool_name, 'size',
                                                  '-f', 'json-pretty')
        _ = json.loads(out)

        proc = self.mount_a.run_shell(['df', '.'])
        output = proc.stdout.getvalue()
        fs_avail = output.split('\n')[1].split()[3]
        fs_avail = float(fs_avail) * 1024

        ratio = raw_avail / fs_avail
        assert 0.9 < ratio < 1.1

    def test_dump_inode(self):
        info = self.fs.mds_asok(['dump', 'inode', '1'])
        assert info['path'] == "/"

    def test_dump_inode_hexadecimal(self):
        self.mount_a.run_shell(["mkdir", "-p", "foo"])
        ino = self.mount_a.path_to_ino("foo")
        assert type(ino) is int
        info = self.fs.mds_asok(['dump', 'inode', hex(ino)])
        assert info['path'] == "/foo"

    def test_fs_lsflags(self):
        """
        Check that 'fs lsflags' reports both the default and the newly set
        state of the flags.
        """
        # Set some flags
        self.fs.set_joinable(False)
        self.fs.set_allow_new_snaps(False)
        self.fs.set_allow_standby_replay(True)

        lsflags = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'lsflags',
                                                                 self.fs.name,
                                                                 "--format=json-pretty"))
        self.assertEqual(lsflags["joinable"], False)
        self.assertEqual(lsflags["allow_snaps"], False)
        self.assertEqual(lsflags["allow_multimds_snaps"], True)
        self.assertEqual(lsflags["allow_standby_replay"], True)

    def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False):
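        """
        Repeatedly create and remove directories under dir_path, timing a
        filesystem sync (or an fsync of the directory when file_sync is True)
        after each batch; every sync must finish in well under 5 seconds.
        """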
        self.mount_a.run_shell(["mkdir", dir_path])

        sync_dir_pyscript = dedent("""
            import os

            path = "{path}"
            dfd = os.open(path, os.O_DIRECTORY)
            os.fsync(dfd)
            os.close(dfd)
            """.format(path=dir_path))

        # run create/delete directories and test the sync time duration
        for i in range(300):
            for j in range(5):
                self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync mkdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

            for j in range(5):
                self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")])
            start = time.time()
            if file_sync:
                self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript])
            else:
                self.mount_a.run_shell(["sync"])
            duration = time.time() - start
            log.info(f"sync rmdir i = {i}, duration = {duration}")
            self.assertLess(duration, 4)

        self.mount_a.run_shell(["rm", "-rf", dir_path])

    def test_filesystem_sync_stuck_for_around_5s(self):
        """
        Check that a filesystem sync gets stuck waiting for the mdlog to be
        flushed for at most around 5 seconds.
        """

        dir_path = "filesystem_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path)

    def test_file_sync_stuck_for_around_5s(self):
        """
        Check that an fsync gets stuck waiting for the mdlog to be flushed
        for at most around 5 seconds.
        """

        dir_path = "file_sync_do_not_wait_mdlog_testdir"
        self._test_sync_stuck_for_around_5s(dir_path, True)

    def test_file_filesystem_sync_crash(self):
        """
        Check that the kernel does not crash when file and filesystem syncs
        run concurrently with directory creation/removal.
        """

        stop_thread = False
        dir_path = "file_filesystem_sync_crash_testdir"
        self.mount_a.run_shell(["mkdir", dir_path])

        def mkdir_rmdir_thread(mount, path):
            # stop_thread is read from the enclosing scope
            log.info(" mkdir_rmdir_thread starting...")
            num = 0
            while not stop_thread:
                n = num
                m = num
                for __ in range(10):
                    mount.run_shell(["mkdir", os.path.join(path, f"{n}")])
                    n += 1
                for __ in range(10):
                    mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")])
                    m += 1
                num += 10
            log.info(" mkdir_rmdir_thread stopped")

        def filesystem_sync_thread(mount, path):
            # stop_thread is read from the enclosing scope
            log.info(" filesystem_sync_thread starting...")
            while not stop_thread:
                mount.run_shell(["sync"])
            log.info(" filesystem_sync_thread stopped")

        def file_sync_thread(mount, path):
            # stop_thread is read from the enclosing scope
            log.info(" file_sync_thread starting...")
            pyscript = dedent("""
                import os

                path = "{path}"
                dfd = os.open(path, os.O_DIRECTORY)
                os.fsync(dfd)
                os.close(dfd)
                """.format(path=path))

            while not stop_thread:
                mount.run_shell(['python3', '-c', pyscript])
            log.info(" file_sync_thread stopped")

        td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,))
        td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,))
        td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,))

        td1.start()
        td2.start()
        td3.start()
        time.sleep(1200)  # run for 20 minutes
        stop_thread = True
        td1.join()
        td2.join()
        td3.join()
        self.mount_a.run_shell(["rm", "-rf", dir_path])


class TestCacheDrop(CephFSTestCase):
    CLIENTS_REQUIRED = 1

    def _run_drop_cache_cmd(self, timeout=None):
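        """
        Issue the MDS 'cache drop' command via rank_tell, optionally with a
        timeout argument, and return the parsed result. The tests below read
        'duration', 'client_recall', 'flush_journal' and 'trim_cache' from it.
        """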
        args = ["cache", "drop"]
        if timeout is not None:
            args.append(str(timeout))
        return self.fs.rank_tell(args)

    def _setup(self, max_caps=20, threshold=400):
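        """
        Create a batch of files so the client holds plenty of caps, then lower
        the MDS recall limits so cache-drop behaviour is visible at this small
        scale.
        """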
        # create some files
        self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True)

        # Reduce this so the MDS doesn't recall the maximum for simple tests
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)])
        self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)])

    def test_drop_cache_command(self):
        """
        Basic test for checking the drop cache command.
        Confirm it halts without a timeout.
        Note that the cache size post trimming is not checked here.
        """
        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
        self._setup()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        # It should take at least 1 second
        self.assertGreater(result['duration'], 1)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client)

    def test_drop_cache_command_timeout(self):
        """
        Basic test for checking the drop cache command.
        Confirm recall halts early via a timeout.
        Note that the cache size post trimming is not checked here.
        """
        self._setup()
        result = self._run_drop_cache_cmd(timeout=10)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 10)
        self.assertGreaterEqual(result['trim_cache']['trimmed'], 100)  # we did something, right?

    def test_drop_cache_command_dead_timeout(self):
        """
        Check the drop cache command with a non-responding client using the
        tell interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        # Note: recall is subject to the timeout. The journal flush will
        # be delayed due to the client being dead.
        result = self._run_drop_cache_cmd(timeout=5)
        self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is the journal flush, which causes
        # the MDS to wait for cap revoke.
        #self.assertEqual(0, result['trim_cache']['trimmed'])
        self.mount_a.resume_netns()

    def test_drop_cache_command_dead(self):
        """
        Check the drop cache command with a non-responding client using the
        tell interface. Note that the cache size post trimming is not checked
        here.
        """
        self._setup()
        self.mount_a.suspend_netns()
        result = self._run_drop_cache_cmd()
        self.assertEqual(result['client_recall']['return_code'], 0)
        self.assertEqual(result['flush_journal']['return_code'], 0)
        self.assertGreater(result['duration'], 5)
        self.assertLess(result['duration'], 120)
        # Note: result['trim_cache']['trimmed'] may be >0 because dropping the
        # cache now causes the Locker to drive eviction of stale clients (a
        # stale session will be autoclosed at mdsmap['session_timeout']). The
        # particular operation causing this is the journal flush, which causes
        # the MDS to wait for cap revoke.
        self.mount_a.resume_netns()