# ceph/qa/tasks/cephfs/cephfs_test_case.py (ceph 16.2.6)

import json
import logging
import os
import re

from shlex import split as shlex_split

from tasks.ceph_test_case import CephTestCase

from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError

log = logging.getLogger(__name__)

def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f

def needs_trimming(f):
    """
    Mark a function as requiring a client capable of trimming its cache
    (i.e. for ceph-fuse this currently means it must be able to run as root)
    """
    f.needs_trimming = True
    return f

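# Usage sketch (hypothetical test class; names are illustrative only).  The
# decorators above simply tag the function object; the test scheduler inspects
# these attributes to decide whether to run or skip a test:
#
#   class TestSomething(CephFSTestCase):
#       @for_teuthology
#       def test_long_running(self):
#           ...  # only scheduled in full teuthology runs
#
#       @needs_trimming
#       def test_cache_trim(self):
#           ...  # skipped when the client cannot trim its cache
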
class MountDetails():

    def __init__(self, mntobj):
        self.client_id = mntobj.client_id
        self.client_keyring_path = mntobj.client_keyring_path
        self.client_remote = mntobj.client_remote
        self.cephfs_name = mntobj.cephfs_name
        self.cephfs_mntpt = mntobj.cephfs_mntpt
        self.hostfs_mntpt = mntobj.hostfs_mntpt

    def restore(self, mntobj):
        mntobj.client_id = self.client_id
        mntobj.client_keyring_path = self.client_keyring_path
        mntobj.client_remote = self.client_remote
        mntobj.cephfs_name = self.cephfs_name
        mntobj.cephfs_mntpt = self.cephfs_mntpt
        mntobj.hostfs_mntpt = self.hostfs_mntpt

class CephFSTestCase(CephTestCase):
    """
    Test case for CephFS. Requires the caller to populate Filesystem and
    Mount objects into the fs, mount_a and mount_b class attributes
    (setting mount_b is optional).

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None
    recovery_mount = None

    # Declarative test requirements: subclasses should override these to
    # indicate their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    # Requires REQUIRE_FILESYSTEM = True
    REQUIRE_RECOVERY_FILESYSTEM = False

    # Create a backup filesystem if required.
    # Requires REQUIRE_FILESYSTEM = True
    REQUIRE_BACKUP_FILESYSTEM = False

    LOAD_SETTINGS = []  # type: ignore
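
    # Illustrative (hypothetical) subclass showing how the declarative
    # requirements above are typically overridden:
    #
    #   class TestFragmentation(CephFSTestCase):
    #       CLIENTS_REQUIRED = 1
    #       MDSS_REQUIRED = 2
    #       LOAD_SETTINGS = ["mds_bal_fragment_size_max"]
    #
    # Each LOAD_SETTINGS entry is read back from the MDS during setUp() and
    # exposed as a float attribute, e.g. self.mds_bal_fragment_size_max here.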

    def _save_mount_details(self):
        """
        XXX: Tests may change details of mount objects, so let's stash them so
        that these details are restored later to ensure smooth setUps and
        tearDowns for upcoming tests.
        """
        self._orig_mount_details = [MountDetails(m) for m in self.mounts]
        log.info(self._orig_mount_details)

    def setUp(self):
        super(CephFSTestCase, self).setUp()

        self.config_set('mon', 'mon_allow_pool_delete', True)

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            self.skipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            self.skipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                self.skipTest("Require first client to be on separate server from MDSs")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients, we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)
        self._save_mount_details()

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing a rm -rf of files
        self.mds_cluster.delete_all_filesystems()
        self.mds_cluster.mds_restart()  # to reset any run-time configs, etc.
        self.fs = None  # is now invalid!
        self.backup_fs = None
        self.recovery_fs = None

        # In case anything is in the OSD blocklist, clear it out.  This is to
        # avoid the OSD map changing in the background (due to blocklist
        # expiry) while tests run.
        try:
            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "clear")
        except CommandFailedError:
            # Fallback for older Ceph cluster
            blocklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                "dump", "--format=json-pretty"))['blocklist']
            log.info("Removing {0} blocklist entries".format(len(blocklist)))
            for addr, blocklisted_at in blocklist.items():
                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "rm", addr)

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and not (ent_id == "admin" or ent_id[:6] == 'mirror'):
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(create=True)

            # In case some test messed with auth caps, reset them
            for client_id in client_mount_ids:
                cmd = ['auth', 'caps', f'client.{client_id}', 'mon', 'allow r',
                       'osd', f'allow rw pool={self.fs.get_data_pool_name()}',
                       'mds', 'allow']

                # If updating the caps succeeds, the entity already exists;
                # move on to the next client (a "break" here would leave the
                # remaining clients' caps unreset).
                if self.run_cluster_cmd_result(cmd) == 0:
                    continue

                # Otherwise the entity is missing: create it instead
                cmd[1] = 'add'
                if self.run_cluster_cmd_result(cmd) != 0:
                    raise RuntimeError(f'Failed to create new client {cmd[2]}')

            # Wait for ranks to become active
            self.fs.wait_for_daemons()

            # Mount the requested number of clients
            for i in range(0, self.CLIENTS_REQUIRED):
                self.mounts[i].mount_wait()

        if self.REQUIRE_BACKUP_FILESYSTEM:
            if not self.REQUIRE_FILESYSTEM:
                self.skipTest("Backup filesystem requires a primary filesystem as well")
            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
                                                'enable_multiple', 'true',
                                                '--yes-i-really-mean-it')
            self.backup_fs = self.mds_cluster.newfs(name="backup_fs")
            self.backup_fs.wait_for_daemons()

        if self.REQUIRE_RECOVERY_FILESYSTEM:
            if not self.REQUIRE_FILESYSTEM:
                self.skipTest("Recovery filesystem requires a primary filesystem as well")
            # After Octopus is EOL, we can remove this setting:
            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
                                                'enable_multiple', 'true',
                                                '--yes-i-really-mean-it')
            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
            self.recovery_fs.set_metadata_overlay(True)
            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
            self.recovery_fs.create()
            self.recovery_fs.getinfo(refresh=True)
            self.recovery_fs.wait_for_daemons()

        # Load any config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        # To prevent failover messages during unwind of the ceph task
        self.mds_cluster.delete_all_filesystems()

        for m, md in zip(self.mounts, self._orig_mount_details):
            md.restore(m)

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

        return super(CephFSTestCase, self).tearDown()

    def set_conf(self, subsys, key, value):
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)
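
    # Example: self.set_conf('mds', 'mds_log_max_segments', '2') applies the
    # setting via ceph.conf and records the (subsys, key) pair so that
    # tearDown() clears it again.
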
    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']
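
    # Each returned entry has roughly this shape (from "ceph auth ls
    # --format=json"; key value elided):
    #   {"entity": "client.admin", "key": "...",
    #    "caps": {"mds": "allow *", "mon": "allow *", ...}}
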
    def assert_session_count(self, expected, ls_data=None, mds_id=None):
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id, expected_state):
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
        return self.get_session(client_id)

    def _session_list(self):
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
        return dict([(s['id'], s) for s in session_ls])

    def perf_dump(self, rank=None, status=None):
        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)

    def wait_until_evicted(self, client_id, timeout=30):
        def is_client_evicted():
            ls = self._session_list()
            for s in ls:
                if s['id'] == client_id:
                    return False
            return True
        self.wait_until_true(is_client_evicted, timeout)
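
    # Typical use, assuming the client was evicted via the MDS admin socket:
    #   self.fs.mds_asok(['session', 'evict', "%s" % client_id])
    #   self.wait_until_evicted(client_id)
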
    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise

    def delete_mds_coredump(self, daemon_id):
        # Delete the coredump file, otherwise teuthology.internal.coredump
        # will catch it later and treat it as a failure.
        core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
            "sudo sysctl -n kernel.core_pattern")
        core_dir = os.path.dirname(core_pattern.strip())
        if core_dir:  # Non-default core_pattern with a directory in it
            # We have seen a core_pattern that looks like it's from teuthology's
            # coredump task, so proceed to clear out the core file
            if core_dir[0] == '|':
                log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
                return

            log.info("Clearing core from directory: {0}".format(core_dir))

            # Verify that we see the expected single coredump
            ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
                "cd", core_dir, run.Raw('&&'),
                "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
            ])
            cores = [l.partition(":")[0]
                     for l in ls_output.strip().split("\n")
                     if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]

            log.info("Enumerated cores: {0}".format(cores))
            self.assertEqual(len(cores), 1)

            log.info("Found core file {0}, deleting it".format(cores[0]))

            self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
            ])
        else:
            log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

    def _get_subtrees(self, status=None, rank=None, path=None):
        if path is None:
            path = "/"
        try:
            with contextutil.safe_while(sleep=1, tries=3) as proceed:
                while proceed():
                    try:
                        if rank == "all":
                            subtrees = []
                            for r in self.fs.get_ranks(status=status):
                                s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
                                s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
                                subtrees += s
                        else:
                            subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
                        subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
                        return list(subtrees)
                    except CommandFailedError as e:
                        # Sometimes we get transient errors
                        if e.exitstatus == 22:
                            pass
                        else:
                            raise
        except contextutil.MaxWhileTries as e:
            raise RuntimeError(f"could not get subtree state from rank {rank}") from e
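
    # Each subtree entry returned by the MDS "get subtrees" admin socket
    # command contains, among other fields, roughly:
    #   {'dir': {'path': '/a'}, 'auth_first': 0, 'auth_second': -2,
    #    'export_pin_target': -1, 'distributed_ephemeral_pin': False,
    #    'random_ephemeral_pin': False}
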
    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
        test = sorted(test)
        try:
            with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
                    log.info("%s =?= %s", filtered, test)
                    if filtered == test:
                        # Confirm export_pin in output is correct:
                        for s in subtrees:
                            if s['export_pin_target'] >= 0:
                                self.assertTrue(s['export_pin_target'] == s['auth_first'])
                        return subtrees
                    if action is not None:
                        action()
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e
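
    # Usage sketch (illustrative paths): pin two directories to different
    # ranks, then wait for the auth mapping to converge:
    #   self.mount_a.setfattr("a", "ceph.dir.pin", "0")
    #   self.mount_a.setfattr("b", "ceph.dir.pin", "1")
    #   self._wait_subtrees([('/a', 0), ('/b', 1)], status=status)
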
    def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100):
        # Note the parentheses around the conditional expression: without
        # them, "+" binds tighter and a non-recursive call would send an
        # empty scrub command.
        out_json = self.fs.run_scrub(["start", path] + (["recursive"] if recursive else []))
        if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"],
                                                 sleep=10, timeout=timeout):
            log.info("timed out waiting for scrub to complete")

    def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True and
                                           s['auth_first'] == s['export_pin_target'],
                                           subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True and
                                           s['auth_first'] == s['export_pin_target'],
                                           subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def run_cluster_cmd(self, cmd):
        if isinstance(cmd, str):
            cmd = shlex_split(cmd)
        return self.fs.mon_manager.raw_cluster_cmd(*cmd)

    def run_cluster_cmd_result(self, cmd):
        if isinstance(cmd, str):
            cmd = shlex_split(cmd)
        return self.fs.mon_manager.raw_cluster_cmd_result(*cmd)
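
    # Both helpers accept either a string or a pre-split argument list, e.g.:
    #   self.run_cluster_cmd('fs subvolume ls cephfs')
    #   self.run_cluster_cmd(['fs', 'subvolume', 'ls', 'cephfs'])
    # run_cluster_cmd() returns the command's stdout and raises
    # CommandFailedError on failure; run_cluster_cmd_result() returns the
    # exit status instead.
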
    def create_client(self, client_id, moncap=None, osdcap=None, mdscap=None):
        if not (moncap or osdcap or mdscap):
            if self.fs:
                return self.fs.authorize(client_id, ('/', 'rw'))
            else:
                raise RuntimeError('no caps were passed and the default FS '
                                   'is not created yet to allow client auth '
                                   'for it.')

        cmd = ['auth', 'add', f'client.{client_id}']
        if moncap:
            cmd += ['mon', moncap]
        if osdcap:
            cmd += ['osd', osdcap]
        if mdscap:
            cmd += ['mds', mdscap]

        self.run_cluster_cmd(cmd)
        # Fetch the newly created entity's keyring by its client id
        # (self.client_name is not defined on this class).
        return self.run_cluster_cmd(f'auth get client.{client_id}')
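
    # Illustrative use: create an entity with restricted caps, then hand the
    # returned keyring to create_keyring_file() for a mount (the cap strings
    # are examples only):
    #   keyring = self.create_client('foo', moncap='allow r',
    #                                osdcap='allow rw tag cephfs data=a',
    #                                mdscap='allow rw')
    #   keyring_path = self.create_keyring_file(remote, keyring)
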
    def create_keyring_file(self, remote, keyring):
        keyring_path = remote.mktemp(data=keyring)

        # Required when triggered using vstart_runner.py.
        remote.run(args=['chmod', '644', keyring_path])

        return keyring_path