# ceph/qa/tasks/workunit.py -- from ceph.git (v15.2.10); gitweb scrape header removed
1 """
2 Workunit task -- Run ceph on sets of specific clients
3 """
4 import logging
5 import pipes
6 import os
7 import re
8
9 import six
10
11 from tasks.util import get_remote_for_role
12 from tasks.util.workunit import get_refspec_after_overrides
13
14 from teuthology import misc
15 from teuthology.config import config as teuth_config
16 from teuthology.orchestra.run import CommandFailedError
17 from teuthology.parallel import parallel
18 from teuthology.orchestra import run
19
20 log = logging.getLogger(__name__)
21
def task(ctx, config):
    """
    Run ceph on all workunits found under the specified path.

    For example::

        tasks:
        - ceph:
        - ceph-fuse: [client.0]
        - workunit:
            clients:
              client.0: [direct_io, xattrs.sh]
              client.1: [snaps]
            branch: foo

    You can also run a list of workunits on all clients:
        tasks:
        - ceph:
        - ceph-fuse:
        - workunit:
            tag: v0.47
            clients:
              all: [direct_io, xattrs.sh, snaps]

    If you have an "all" section it will run all the workunits
    on each client simultaneously, AFTER running any workunits specified
    for individual clients. (This prevents unintended simultaneous runs.)

    To customize tests, you can specify environment variables as a dict. You
    can also specify a time limit for each work unit (defaults to 3h):

        tasks:
        - ceph:
        - ceph-fuse:
        - workunit:
            sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
            clients:
              all: [snaps]
            env:
              FOO: bar
              BAZ: quux
            timeout: 3h

    This task supports roles that include a ceph cluster, e.g.::

        tasks:
        - ceph:
        - workunit:
            clients:
              backup.client.0: [foo]
              client.1: [bar] # cluster is implicitly 'ceph'

    You can also specify an alternative top-level dir to 'qa/workunits', like
    'qa/standalone', with::

        tasks:
        - install:
        - workunit:
            basedir: qa/standalone
            clients:
              client.0:
                - test-ceph-helpers.sh

    :param ctx: Context
    :param config: Configuration
    """
    assert isinstance(config, dict)
    assert isinstance(config.get('clients'), dict), \
        'configuration must contain a dictionary of clients'

    refspec = get_refspec_after_overrides(config, ctx.config.get('overrides', {}))
    timeout = config.get('timeout', '3h')
    cleanup = config.get('cleanup', True)

    log.info('Pulling workunits from ref %s', refspec)

    if config.get('env') is not None:
        assert isinstance(config['env'], dict), 'env must be a dictionary'
    clients = config['clients']

    # Make a per-role scratch dir for every role except the "all"
    # pseudo-role; remember which mountpoints we had to create so they
    # can be removed again during cleanup.
    log.info('Making a separate scratch dir for every client...')
    created_mountpoint = {}
    for role in clients.keys():
        assert isinstance(role, six.string_types)
        if role == "all":
            continue
        assert 'client' in role
        created_mountpoint[role] = _make_scratch_dir(ctx, role, config.get('subdir'))

    # Run the per-role workunits concurrently.
    log.info("timeout={}".format(timeout))
    log.info("cleanup={}".format(cleanup))
    with parallel() as p:
        for role, tests in clients.items():
            if role == "all":
                continue
            p.spawn(_run_tests, ctx, refspec, role, tests,
                    config.get('env'),
                    basedir=config.get('basedir', 'qa/workunits'),
                    timeout=timeout,
                    cleanup=cleanup,
                    coverage_and_limits=not config.get('no_coverage_and_limits', None))

    if cleanup:
        # Remove the per-role scratch dirs (and any mountpoints created above).
        for role, created in created_mountpoint.items():
            _delete_dir(ctx, role, created)

    # The "all" workunits run last, on every client simultaneously.
    if 'all' in clients:
        _spawn_on_all_clients(ctx, refspec, clients["all"], config.get('env'),
                              config.get('basedir', 'qa/workunits'),
                              config.get('subdir'), timeout=timeout,
                              cleanup=cleanup)
141
142
def _client_mountpoint(ctx, cluster, id_):
    """
    Return the path where workunits for this client are expected to be
    mounted, under the test directory.

    For compatibility with tasks like ceph-fuse that are not yet
    cluster-aware, the cluster name is embedded in the directory name
    only when the cluster is not the default 'ceph'.
    """
    dir_ = ('mnt.{0}'.format(id_) if cluster == 'ceph'
            else 'mnt.{0}.{1}'.format(cluster, id_))
    return os.path.join(misc.get_testdir(ctx), dir_)
155
156
def _delete_dir(ctx, role, created_mountpoint):
    """
    Delete the scratch directory this role used inside its mount, and, if
    the mountpoint itself was artificially created by _make_scratch_dir,
    remove that directory as well.

    :param ctx: Context
    :param role: "role.#" where # is used for the role id.
    :param created_mountpoint: True if the mountpoint was created by
                               _make_scratch_dir and should be removed here.
    """
    cluster, _, id_ = misc.split_role(role)
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)
    client = os.path.join(mnt, 'client.{id}'.format(id=id_))

    # Remove the directory inside the mount where the workunit ran
    remote.run(
        args=[
            'sudo',
            'rm',
            '-rf',
            '--',
            client,
        ],
    )
    log.info("Deleted dir {dir}".format(dir=client))

    # If the mount was an artificially created dir, delete that too
    if created_mountpoint:
        remote.run(
            args=[
                'rmdir',
                '--',
                mnt,
            ],
        )
        # Bug fix: log the mountpoint actually removed (mnt), not the
        # client dir that was already deleted and logged above.
        log.info("Deleted artificial mount point {dir}".format(dir=mnt))
192
193
def _make_scratch_dir(ctx, role, subdir):
    """
    Ensure the mountpoint and a scratch subdirectory exist for this role.

    :param ctx: Context
    :param role: "role.#" where # is used for the role id.
    :param subdir: use this subdir (False if not used)
    :return: True when the mountpoint directory itself had to be created.
    """
    cluster, _, id_ = misc.split_role(role)
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)

    # When neither kclient nor ceph-fuse are in play, the mountpoint may
    # not exist yet: probe it with stat, and mkdir it on failure.
    try:
        remote.run(args=['stat', '--', mnt])
        log.info('Did not need to create dir {dir}'.format(dir=mnt))
        created_mountpoint = False
    except CommandFailedError:
        remote.run(args=['mkdir', '--', mnt])
        log.info('Created dir {dir}'.format(dir=mnt))
        created_mountpoint = True

    if not subdir:
        subdir = 'client.{id}'.format(id=id_)

    if created_mountpoint:
        # Fresh directory we own: a plain mkdir suffices.
        remote.run(
            args=['cd', '--', mnt, run.Raw('&&'), 'mkdir', '--', subdir],
        )
    else:
        remote.run(
            args=[
                # cd first so this will fail if the mount point does
                # not exist; pure install -d will silently do the
                # wrong thing
                'cd',
                '--',
                mnt,
                run.Raw('&&'),
                'sudo',
                'install',
                '-d',
                '-m', '0755',
                '--owner={user}'.format(user=remote.user),
                '--',
                subdir,
            ],
        )

    return created_mountpoint
266
267
def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
    """
    Make a scratch directory for every client role in the cluster, then run
    the tests on all clients: each unit is executed on every client in
    parallel before moving on to the next unit.

    See run_tests() for parameter documentation.
    """
    is_client = misc.is_type('client')
    client_remotes = {}
    created_mountpoint = {}
    for remote, roles_for_host in ctx.cluster.remotes.items():
        for role in roles_for_host:
            if not is_client(role):
                continue
            client_remotes[role] = remote
            created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)

    for unit in tests:
        with parallel() as p:
            for role in client_remotes.keys():
                p.spawn(_run_tests, ctx, refspec, role, [unit], env,
                        basedir,
                        subdir,
                        timeout=timeout)

    # cleanup the generated client directories
    if cleanup:
        for role in client_remotes.keys():
            _delete_dir(ctx, role, created_mountpoint[role])
296
297
def _run_tests(ctx, refspec, role, tests, env, basedir,
               subdir=None, timeout=None, cleanup=True,
               coverage_and_limits=True):
    """
    Run the individual test. Create a scratch directory and then extract the
    workunits from git. Make the executables, and then run the tests.
    Clean up (remove files created) after the tests are finished.

    :param ctx: Context
    :param refspec: branch, sha1, or version tag used to identify this
                    build
    :param role: "<cluster>.client.<id>" role whose remote the tests run on.
    :param tests: specific tests specified.
    :param env: environment set in yaml file. Could be None.
    :param basedir: top-level directory inside the clone holding the
                    workunit scripts (e.g. 'qa/workunits').
    :param subdir: subdirectory set in yaml file. Could be None
    :param timeout: If present, use the 'timeout' command on the remote host
                    to limit execution time. Must be specified by a number
                    followed by 's' for seconds, 'm' for minutes, 'h' for
                    hours, or 'd' for days. If '0' or anything that evaluates
                    to False is passed, the 'timeout' command is not used.
    :param cleanup: if True, remove the scratch dir after each spec's
                    workunits finish.
    :param coverage_and_limits: if True, wrap each workunit invocation with
                    adjust-ulimits and ceph-coverage.
    """
    testdir = misc.get_testdir(ctx)
    assert isinstance(role, six.string_types)
    cluster, type_, id_ = misc.split_role(role)
    assert type_ == 'client'
    remote = get_remote_for_role(ctx, role)
    mnt = _client_mountpoint(ctx, cluster, id_)
    # subdir so we can remove and recreate this a lot without sudo
    if subdir is None:
        scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
    else:
        scratch_tmp = os.path.join(mnt, subdir)
    clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
    srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
                                       basedir=basedir)

    git_url = teuth_config.get_ceph_qa_suite_git_url()
    # if we are running an upgrade test, and ceph-ci does not have branches like
    # `jewel`, so should use ceph.git as an alternative.
    try:
        remote.run(logger=log.getChild(role),
                   args=refspec.clone(git_url, clonedir))
    except CommandFailedError:
        # Fall back from ceph-ci.git to ceph.git; re-raise if the URL is
        # not a ceph-ci one, since then there is no alternative to try.
        if git_url.endswith('/ceph-ci.git'):
            alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git')
        elif git_url.endswith('/ceph-ci'):
            alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url)
        else:
            raise
        log.info(
            "failed to check out '%s' from %s; will also try in %s",
            refspec,
            git_url,
            alt_git_url,
        )
        remote.run(logger=log.getChild(role),
                   args=refspec.clone(alt_git_url, clonedir))
    # Build (if a Makefile exists) and dump a NUL-separated list of every
    # executable file under srcdir into workunits.list.<role> on the remote.
    remote.run(
        logger=log.getChild(role),
        args=[
            'cd', '--', srcdir,
            run.Raw('&&'),
            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
            run.Raw('&&'),
            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
            run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
        ],
    )

    # Fetch the list back and sort it so matching/execution order is stable.
    workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
    workunits = sorted(six.ensure_str(misc.get_file(remote, workunits_file)).split('\0'))
    assert workunits

    try:
        assert isinstance(tests, list)
        for spec in tests:
            log.info('Running workunits matching %s on %s...', spec, role)
            # A spec matches itself exactly, or any workunit beneath it
            # as a directory prefix (e.g. 'snaps' matches 'snaps/foo.sh').
            prefix = '{spec}/'.format(spec=spec)
            to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
            if not to_run:
                raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
            for workunit in to_run:
                log.info('Running workunit %s...', workunit)
                # Assemble the remote command: create/enter the scratch dir,
                # then export the CEPH_* environment the workunit scripts expect.
                args = [
                    'mkdir', '-p', '--', scratch_tmp,
                    run.Raw('&&'),
                    'cd', '--', scratch_tmp,
                    run.Raw('&&'),
                    run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
                    run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
                    run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
                    run.Raw('CEPH_ID="{id}"'.format(id=id_)),
                    run.Raw('PATH=$PATH:/usr/sbin'),
                    run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
                    run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
                ]
                if env is not None:
                    # Shell-quote user-supplied values before splicing them
                    # into the raw command line.
                    for var, val in env.items():
                        quoted_val = pipes.quote(val)
                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
                        args.append(run.Raw(env_arg))
                if coverage_and_limits:
                    args.extend([
                        'adjust-ulimits',
                        'ceph-coverage',
                        '{tdir}/archive/coverage'.format(tdir=testdir)])
                if timeout and timeout != '0':
                    args.extend(['timeout', timeout])
                args.extend([
                    '{srcdir}/{workunit}'.format(
                        srcdir=srcdir,
                        workunit=workunit,
                    ),
                ])
                remote.run(
                    logger=log.getChild(role),
                    args=args,
                    label="workunit test {workunit}".format(workunit=workunit)
                )
                if cleanup:
                    args=['sudo', 'rm', '-rf', '--', scratch_tmp]
                    remote.run(logger=log.getChild(role), args=args, timeout=(60*60))
    finally:
        # Always remove the workunit list and the clone, even on failure.
        log.info('Stopping %s on %s...', tests, role)
        args=['sudo', 'rm', '-rf', '--', workunits_file, clonedir]
        # N.B. don't cleanup scratch_tmp! If the mount is broken then rm will hang.
        remote.run(
            logger=log.getChild(role),
            args=args,
        )