# ceph/qa/tasks/cephfs/cephfs_test_case.py (ceph.git, v15.2.9)
import json
import logging
from tasks.ceph_test_case import CephTestCase
import os
import re

from tasks.cephfs.fuse_mount import FuseMount

from teuthology import contextutil
from teuthology.orchestra import run
from teuthology.orchestra.run import CommandFailedError
from teuthology.contextutil import safe_while


log = logging.getLogger(__name__)


def for_teuthology(f):
    """
    Decorator that adds an "is_for_teuthology" attribute to the wrapped function
    """
    f.is_for_teuthology = True
    return f


def needs_trimming(f):
    """
    Mark a test as requiring a client capable of trimming its cache (i.e. for
    ceph-fuse this means it needs to be able to run as root, currently)
    """
    f.needs_trimming = True
    return f


class CephFSTestCase(CephTestCase):
    """
    Test case for CephFS. Requires the caller to populate the Filesystem and Mount
    objects into the fs, mount_a, mount_b class attributes (setting mount_b is optional).

    Handles resetting the cluster under test between tests.
    """

    # FIXME weird explicit naming
    mount_a = None
    mount_b = None
    recovery_mount = None

    # Declarative test requirements: subclasses should override these to indicate
    # their special needs.  If not met, tests will be skipped.
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 1
    REQUIRE_KCLIENT_REMOTE = False
    REQUIRE_ONE_CLIENT_REMOTE = False

    # Whether to create the default filesystem during setUp
    REQUIRE_FILESYSTEM = True

    # requires REQUIRE_FILESYSTEM = True
    REQUIRE_RECOVERY_FILESYSTEM = False

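    # Names of MDS config settings to read via the admin socket during setUp;
    # each is stored as a float attribute of the same name on the test case.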
    LOAD_SETTINGS = []  # type: ignore

    def setUp(self):
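        """
        Reset the cluster to a known state: check test requirements, unmount all
        clients, recreate the default filesystem (if required), reset auth caps,
        and remount the requested number of clients.
        """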
        super(CephFSTestCase, self).setUp()

        self.config_set('mon', 'mon_allow_pool_delete', True)

        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
            self.skipTest("Only have {0} MDSs, require {1}".format(
                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
            ))

        if len(self.mounts) < self.CLIENTS_REQUIRED:
            self.skipTest("Only have {0} clients, require {1}".format(
                len(self.mounts), self.CLIENTS_REQUIRED
            ))

        if self.REQUIRE_KCLIENT_REMOTE:
            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
                # kclient kill() power cycles nodes, so requires clients to each be on
                # their own node
                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
                    self.skipTest("kclient clients must be on separate nodes")

        if self.REQUIRE_ONE_CLIENT_REMOTE:
            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
                self.skipTest("Require first client to be on separate server from MDSs")

        # Create friendly mount_a, mount_b attrs
        for i in range(0, self.CLIENTS_REQUIRED):
            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])

        self.mds_cluster.clear_firewall()

        # Unmount all clients, we are about to blow away the filesystem
        for mount in self.mounts:
            if mount.is_mounted():
                mount.umount_wait(force=True)

        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
        # the filesystem rather than just doing a rm -rf of files
        self.mds_cluster.delete_all_filesystems()
        self.mds_cluster.mds_restart()  # to reset any run-time configs, etc.
        self.fs = None  # is now invalid!
        self.recovery_fs = None

        # In case anything is in the OSD blacklist, clear it out.  This is to avoid
        # the OSD map changing in the background (due to blacklist expiry) while tests run.
        try:
            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
        except CommandFailedError:
            # Fallback for older Ceph cluster
            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
                "dump", "--format=json-pretty"))['blacklist']
            log.info("Removing {0} blacklist entries".format(len(blacklist)))
            for addr, blacklisted_at in blacklist.items():
                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)

        client_mount_ids = [m.client_id for m in self.mounts]
        # In case the test changes the IDs of clients, stash them so that we can
        # reset in tearDown
        self._original_client_ids = client_mount_ids
        log.info(client_mount_ids)

        # In case there were any extra auth identities around from a previous
        # test, delete them
        for entry in self.auth_list():
            ent_type, ent_id = entry['entity'].split(".")
            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])

        if self.REQUIRE_FILESYSTEM:
            self.fs = self.mds_cluster.newfs(create=True)

            # In case some test messed with auth caps, reset them
            for client_id in client_mount_ids:
                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
                    'auth', 'caps', "client.{0}".format(client_id),
                    'mds', 'allow',
                    'mon', 'allow r',
                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))

            # wait for ranks to become active
            self.fs.wait_for_daemons()

            # Mount the requested number of clients
            for i in range(0, self.CLIENTS_REQUIRED):
                self.mounts[i].mount_wait()

        if self.REQUIRE_RECOVERY_FILESYSTEM:
            if not self.REQUIRE_FILESYSTEM:
                self.skipTest("Recovery filesystem requires a primary filesystem as well")
            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
                                                'enable_multiple', 'true',
                                                '--yes-i-really-mean-it')
            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
            self.recovery_fs.set_metadata_overlay(True)
            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
            self.recovery_fs.create()
            self.recovery_fs.getinfo(refresh=True)
            self.recovery_fs.mds_restart()
            self.recovery_fs.wait_for_daemons()

        # Load any config settings of interest
        for setting in self.LOAD_SETTINGS:
            setattr(self, setting, float(self.fs.mds_asok(
                ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0]
            )[setting]))

        self.configs_set = set()

    def tearDown(self):
        self.mds_cluster.clear_firewall()
        for m in self.mounts:
            m.teardown()

        for i, m in enumerate(self.mounts):
            m.client_id = self._original_client_ids[i]

        for subsys, key in self.configs_set:
            self.mds_cluster.clear_ceph_conf(subsys, key)

        return super(CephFSTestCase, self).tearDown()

    def set_conf(self, subsys, key, value):
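        """
        Set a ceph.conf option and remember it so that tearDown can clear it again.
        """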
        self.configs_set.add((subsys, key))
        self.mds_cluster.set_ceph_conf(subsys, key, value)

    def auth_list(self):
        """
        Convenience wrapper on "ceph auth ls"
        """
        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
            "auth", "ls", "--format=json-pretty"
        ))['auth_dump']

    def assert_session_count(self, expected, ls_data=None, mds_id=None):
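        """
        Assert that the MDS has the expected number of sessions that are not
        in the 'killing' state.
        """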
        if ls_data is None:
            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)

        alive_count = len([s for s in ls_data if s['state'] != 'killing'])

        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
            expected, alive_count
        ))

    def assert_session_state(self, client_id, expected_state):
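        """
        Assert that the given client's MDS session is in the expected state.
        """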
        self.assertEqual(
            self._session_by_id(
                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
            expected_state)

    def get_session_data(self, client_id):
        return self.get_session(client_id)

    def _session_list(self):
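        """
        Return the MDS session list, excluding sessions in the 'stale' or 'closed' state.
        """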
        ls_data = self.fs.mds_asok(['session', 'ls'])
        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
        return ls_data

    def get_session(self, client_id, session_ls=None):
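        """
        Return the session entry for the given client id, fetching the session
        list from the MDS if one is not supplied.
        """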
        if session_ls is None:
            session_ls = self.fs.mds_asok(['session', 'ls'])

        return self._session_by_id(session_ls)[client_id]

    def _session_by_id(self, session_ls):
        return dict([(s['id'], s) for s in session_ls])

    def perf_dump(self, rank=None, status=None):
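        """
        Return the 'perf dump' output from the given MDS rank's admin socket.
        """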
        return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status)

    def wait_until_evicted(self, client_id, timeout=30):
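        """
        Wait until the given client id no longer appears in the MDS session list.
        """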
        def is_client_evicted():
            ls = self._session_list()
            for s in ls:
                if s['id'] == client_id:
                    return False
            return True
        self.wait_until_true(is_client_evicted, timeout)

    def wait_for_daemon_start(self, daemon_ids=None):
        """
        Wait until all the daemons appear in the FSMap, either assigned
        MDS ranks or in the list of standbys
        """
        def get_daemon_names():
            return [info['name'] for info in self.mds_cluster.status().get_all()]

        if daemon_ids is None:
            daemon_ids = self.mds_cluster.mds_ids

        try:
            self.wait_until_true(
                lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids),
                timeout=30
            )
        except RuntimeError:
            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
                daemon_ids, get_daemon_names()
            ))
            raise

    def delete_mds_coredump(self, daemon_id):
        # delete coredump file, otherwise teuthology.internal.coredump will
        # catch it later and treat it as a failure.
        core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh(
            "sudo sysctl -n kernel.core_pattern")
        core_dir = os.path.dirname(core_pattern.strip())
        if core_dir:  # Non-default core_pattern with a directory in it
            # We have seen a core_pattern that looks like it's from teuthology's coredump
            # task, so proceed to clear out the core file
            if core_dir[0] == '|':
                log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:]))
                return

            log.info("Clearing core from directory: {0}".format(core_dir))

            # Verify that we see the expected single coredump
            ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([
                "cd", core_dir, run.Raw('&&'),
                "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file"
            ])
            cores = [l.partition(":")[0]
                     for l in ls_output.strip().split("\n")
                     if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)]

            log.info("Enumerated cores: {0}".format(cores))
            self.assertEqual(len(cores), 1)

            log.info("Found core file {0}, deleting it".format(cores[0]))

            self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
                "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0]
            ])
        else:
            log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")

    def _get_subtrees(self, status=None, rank=None, path=None):
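        """
        Return the list of subtrees under 'path' (default "/") held by the given
        rank, or by all ranks if rank is "all".  Retries a few times to ride out
        transient admin socket errors.
        """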
        if path is None:
            path = "/"
        try:
            with contextutil.safe_while(sleep=1, tries=3) as proceed:
                while proceed():
                    try:
                        if rank == "all":
                            subtrees = []
                            for r in self.fs.get_ranks(status=status):
                                s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank'])
                                s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s)
                                subtrees += s
                        else:
                            subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank)
                        subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees)
                        return list(subtrees)
                    except CommandFailedError as e:
                        # Sometimes we get transient errors
                        if e.exitstatus == 22:
                            pass
                        else:
                            raise
        except contextutil.MaxWhileTries as e:
            raise RuntimeError(f"could not get subtree state from rank {rank}") from e

    def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None):
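        """
        Poll the subtree map until it matches 'test', a list of (path, auth rank)
        tuples, calling 'action' between polls if one is given.  Raises RuntimeError
        if the desired state is not reached within the timeout.
        """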
        test = sorted(test)
        try:
            with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
                    log.info("%s =?= %s", filtered, test)
                    if filtered == test:
                        # Confirm export_pin in output is correct:
                        for s in subtrees:
                            if s['export_pin'] >= 0:
                                self.assertTrue(s['export_pin'] == s['auth_first'])
                        return subtrees
                    if action is not None:
                        action()
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def _wait_until_scrub_complete(self, path="/", recursive=True):
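        """
        Start a scrub on 'path' and poll 'scrub status' until no scrubs are running.
        """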
        out_json = self.fs.rank_tell(["scrub", "start", path] + (["recursive"] if recursive else []))
        with safe_while(sleep=10, tries=10) as proceed:
            while proceed():
                out_json = self.fs.rank_tell(["scrub", "status"])
                if out_json['status'] == "no active scrubs running":
                    break

    def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None):
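        """
        Wait until at least 'count' subtrees carry a distributed ephemeral pin and
        return them; raises RuntimeError on timeout.
        """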
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True, subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e

    def _wait_random_subtrees(self, count, status=None, rank=None, path=None):
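        """
        Wait until at least 'count' subtrees carry a random ephemeral pin and
        return them; raises RuntimeError on timeout.
        """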
        try:
            with contextutil.safe_while(sleep=5, tries=20) as proceed:
                while proceed():
                    subtrees = self._get_subtrees(status=status, rank=rank, path=path)
                    subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True, subtrees))
                    log.info(f"len={len(subtrees)} {subtrees}")
                    if len(subtrees) >= count:
                        return subtrees
        except contextutil.MaxWhileTries as e:
            raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e